Creating a bag-of-words in scikit-learn
# Build a bag-of-words vocabulary from the text in df.Position_Extra.
# NOTE(review): `df` is not defined in this snippet; it is assumed to be a
# pandas DataFrame with a Position_Extra text column created earlier.

# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Create the token pattern: TOKENS_ALPHANUMERIC
# A token is a run of ASCII letters/digits that is followed by whitespace.
# Because of the lookahead, a run at the very end of a string with no
# trailing whitespace is not matched.
TOKENS_ALPHANUMERIC = r'[A-Za-z0-9]+(?=\s+)'

# Fill missing values in df.Position_Extra
# Assign the result back instead of calling fillna(inplace=True) on the
# attribute-accessed column: the chained in-place form triggers a pandas
# chained-assignment warning and does not update `df` under Copy-on-Write.
df['Position_Extra'] = df['Position_Extra'].fillna('')

# Instantiate the CountVectorizer: vec_alphanumeric
vec_alphanumeric = CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC)

# Fit to the data
vec_alphanumeric.fit(df['Position_Extra'])

# Fetch the learned vocabulary once. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() is its replacement and returns
# an ndarray, so convert to a plain list to keep the printed output a list.
feature_names = vec_alphanumeric.get_feature_names_out().tolist()

# Print the number of tokens and first 15 tokens
msg = "There are {} tokens in Position_Extra if we split on non-alpha numeric"
print(msg.format(len(feature_names)))
print(feature_names[:15])
Source: campus.datacamp.com