In [1]:
from sklearn.datasets import load_files
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.feature_extraction import text
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
In [2]:
# Corpus-specific noise terms: mojibake/escape fragments (xa0, u2026, ...),
# Twitter boilerplate (pic, status, utm_* tracking params), and the account
# holder's own name, none of which carry topical signal.
custom_noise_terms = ['xa0', 'u2026', 'com', 'http', 'https', 'got', 'twitter', 'pic', 'lily', 'herman',
                      'lkherman', 'www', 'xa0pic', 'u2026pic', 'u2019s', 'u201d', 'gonna', 'utm_source',
                      'utm_medium', 'u2019', '07', '03', '04', 'html', '01', '20', 'nhttp', 'u2019t', 'u3000',
                      'status', 'just', 'doing', 'thread', 'tweet']
# Merge with sklearn's built-in English stop words; union() returns a frozenset.
lily_stopwords = text.ENGLISH_STOP_WORDS.union(custom_noise_terms)
In [3]:
# 10-topic online LDA. NOTE: the keyword was renamed n_topics -> n_components
# in scikit-learn 0.19 and n_topics was removed in 0.21, so the old spelling
# raises TypeError on any modern install.
lda = LDA(n_components=10, max_iter=5, learning_method='online', learning_offset=50., random_state=0)
# Bag-of-words features: drop terms in >95% of docs or <2 docs, cap vocab at
# 1000, apply the custom stop-word set, and ignore undecodable bytes.
vectorizer = CV(max_df=0.95, min_df=2, max_features=1000, stop_words=lily_stopwords, decode_error='ignore')
In [7]:
def do_LDA(directory, vectorizer, lda, n):
    '''Fit an LDA topic model on the text corpus stored under `directory`
    (laid out for sklearn.datasets.load_files), then print the top `n`
    words of each learned topic so the results are human-readable.

    Parameters:
        directory: path to a load_files-style corpus directory.
        vectorizer: a CountVectorizer; fit_transform is called on it here.
        lda: a LatentDirichletAllocation instance; fit is called on it here.
        n: number of highest-weight words to print per topic.

    Returns the document-term matrix so that the pyLDAvis visualization
    can be generated from it.'''
    train = load_files(directory)
    matrix = vectorizer.fit_transform(train.data)
    lda.fit(matrix)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # API but fall back for older installs.
    if hasattr(vectorizer, 'get_feature_names_out'):
        features = vectorizer.get_feature_names_out()
    else:
        features = vectorizer.get_feature_names()
    # Original code computed `features` but never used it and ignored `n`;
    # the docstring promised top-word output via an undefined print_top_words
    # helper. Do that reporting inline instead.
    for topic_idx, topic in enumerate(lda.components_):
        top_words = [features[i] for i in topic.argsort()[:-n - 1:-1]]
        print('Topic {}: {}'.format(topic_idx, ' '.join(top_words)))
    return matrix
In [8]:
# Fit the topic model on the 'lkherman_3_text' corpus; 15 is passed as the
# top-words count `n` for do_LDA's reporting.
lily_matrix = do_LDA('lkherman_3_text', vectorizer, lda, 15)
# Last expression of the cell: renders the interactive pyLDAvis topic map
# inline (enabled earlier by pyLDAvis.enable_notebook()).
pyLDAvis.sklearn.prepare(lda, lily_matrix, vectorizer)
Out[8]: