from sklearn.datasets import load_files
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.feature_extraction import text
import pyLDAvis
import pyLDAvis.sklearn
# Render pyLDAvis visualizations inline; assumes this runs in a Jupyter notebook.
pyLDAvis.enable_notebook()
import warnings
# Silence sklearn/pyLDAvis deprecation noise so notebook output stays readable.
warnings.filterwarnings('ignore', category=DeprecationWarning)
# Corpus-specific stop words: escaped-unicode artifacts (xa0, u2026, ...),
# URL/Twitter fragments, and the subject's own name, which would otherwise
# dominate every topic in this scraped-tweet dataset.
tomi_stopwords = ['xa0', 'u2026', 'com', 'http', 'https', 'got', 'twitter', 'pic', 'tomi', 'lahren',
'tomilahren', 'www', 'xa0pic', 'u2026pic', 'u2019s', 'u201d', 'gonna', 'utm_source',
'utm_medium', 'u2019', '07', '03', '04', 'html', '01', '20', 'nhttp', 'u2019t', 'u3000']
# Merge with sklearn's built-in English stop-word list (result is a frozenset).
tomi_stopwords = text.ENGLISH_STOP_WORDS.union(tomi_stopwords)
# NOTE: the `n_topics` keyword was renamed `n_components` in scikit-learn 0.19
# and removed in 0.21; the current name is required on any modern sklearn.
lda = LDA(n_components=10, max_iter=5, learning_method='online',
          learning_offset=50., random_state=0)
# Vocabulary capped at 1000 terms; terms in >95% of docs or <2 docs dropped.
vectorizer = CV(max_df=0.95, min_df=2, max_features=1000,
                stop_words=tomi_stopwords, decode_error='ignore')
def do_LDA(directory, vectorizer, lda, n):
    '''Fit an LDA topic model on the text corpus under *directory*.

    Loads the documents with sklearn's load_files, vectorizes them into a
    document-term matrix, and fits *lda* in place on that matrix.

    Parameters
    ----------
    directory : str
        Path passed to load_files; its subdirectories are the categories.
    vectorizer : CountVectorizer
        Fit-transformed on the raw documents (mutated in place).
    lda : LatentDirichletAllocation
        Fitted in place on the document-term matrix.
    n : int
        Unused; retained for backward compatibility with existing callers.

    Returns
    -------
    scipy.sparse matrix
        The document-term matrix, so a visualization (e.g. pyLDAvis.prepare)
        can be generated from the fitted model.
    '''
    train = load_files(directory)
    matrix = vectorizer.fit_transform(train.data)
    lda.fit(matrix)
    # Dropped: an unused `features = vectorizer.get_feature_names()` local
    # (get_feature_names() was also removed in scikit-learn 1.2).
    return matrix
# Fit the topic model on the scraped-tweet corpus on disk.
tomi_matrix = do_LDA('tomilahren_stuff/tomilahren_1_text', vectorizer, lda, 15)
# Bare final expression so Jupyter renders the interactive visualization inline.
pyLDAvis.sklearn.prepare(lda, tomi_matrix, vectorizer)