# Full pipeline example
# Import the hashing vectorizer
from sklearn.feature_extraction.text import HashingVectorizer
# Instantiate the winning model pipeline: pl
#
# Stages, in order:
#   union - FeatureUnion of two sub-pipelines:
#             numeric_features: get_numeric_data selector -> Imputer()
#             text_features:    get_text_data selector -> HashingVectorizer
#                               -> SelectKBest(chi2, k=chi_k)
#   int   - SparseInteractions(degree=2)
#   scale - MaxAbsScaler()
#   clf   - OneVsRestClassifier wrapping LogisticRegression()
#
# NOTE(review): `Imputer` and HashingVectorizer's `non_negative` argument only
# exist in older scikit-learn releases (their replacements are
# `sklearn.impute.SimpleImputer` and `alternate_sign=False`). They are left
# untouched here because the pinned scikit-learn version is not visible from
# this snippet - confirm before upgrading.
pl = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('imputer', Imputer()),
            ])),
            ('text_features', Pipeline([
                ('selector', get_text_data),
                ('vectorizer', HashingVectorizer(
                    token_pattern=TOKENS_ALPHANUMERIC,
                    non_negative=True,
                    norm=None,
                    binary=False,
                    ngram_range=(1, 2),
                )),
                # Pass `k` by keyword: newer scikit-learn releases reject it
                # as a positional argument, while the keyword form behaves
                # identically on older releases.
                ('dim_red', SelectKBest(chi2, k=chi_k)),
            ])),
        ],
    )),
    ('int', SparseInteractions(degree=2)),
    ('scale', MaxAbsScaler()),
    ('clf', OneVsRestClassifier(LogisticRegression())),
])
# Train the pipeline on the training split, then score the fitted pipeline
# on the held-out split (Pipeline.fit returns the pipeline itself, so the
# two calls can be chained).
accuracy = pl.fit(X_train, y_train).score(X_test, y_test)
# Report the score returned by the pipeline
print("\nAccuracy on budget dataset: ", accuracy)
# josh.ipynb