# examples/jupyter/integrations/NLTK.ipynb
import modin.pandas as pd
import pandas
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
# Import some Tweets from Barack Obama
modin_df = pd.read_csv("https://raw.githubusercontent.com/kirenz/twitter-tweepy/main/tweets-obama.csv")
modin_df.head(3)
# Lowercase the text so tokenization and stopword matching are case-insensitive
modin_df['text'] = modin_df['text'].astype(str).str.lower()
modin_df.head(3)
# Use a raw string for the pattern: '\w' in a plain string literal is an
# invalid escape sequence (SyntaxWarning since Python 3.12).
regexp = RegexpTokenizer(r'\w+')
# Tokenize each tweet into a list of word tokens
modin_df['text_token']=modin_df['text'].apply(regexp.tokenize)
modin_df.head(3)
nltk.download('stopwords')
# Make a set of english stopwords.
# NOTE: this rebinds the name `stopwords`, shadowing the
# `nltk.corpus.stopwords` module imported at the top of the file.
# A set gives O(1) membership tests in the per-token filter below,
# instead of an O(n) list scan for every token of every tweet.
stopwords = set(nltk.corpus.stopwords.words("english"))
# Extend the set with your own custom stopwords
my_stopwords = ['https']
stopwords.update(my_stopwords)
# Remove stopwords
modin_df['text_token'] = modin_df['text_token'].apply(lambda x: [item for item in x if item not in stopwords])
modin_df.head(3)
# Keep only tokens longer than 2 characters and join them back into a string
modin_df['text_string'] = modin_df['text_token'].apply(lambda x: ' '.join([item for item in x if len(item)>2]))
modin_df[['text', 'text_token', 'text_string']].head()
# 'punkt' provides the sentence/word tokenizer models used by word_tokenize
nltk.download('punkt')
# Concatenate all cleaned tweet strings into one corpus string
all_words = ' '.join([word for word in modin_df['text_string']])
tokenized_words = nltk.tokenize.word_tokenize(all_words)
from nltk.probability import FreqDist
# Frequency of each token across the whole (cleaned) corpus
fdist = FreqDist(tokenized_words)
fdist
# Keep tokens that occur at least once in the cleaned corpus.
# NOTE: this is not a no-op — `fdist` was built from `text_string`, which
# already dropped tokens of length <= 2, so those short tokens have
# frequency 0 here and are filtered out. Raise the threshold to also
# drop rare words.
modin_df['text_string_fdist'] = modin_df['text_token'].apply(lambda x: ' '.join([item for item in x if fdist[item] >= 1 ]))
modin_df[['text', 'text_token', 'text_string', 'text_string_fdist']].head()
#lemmatization
# 'wordnet' and 'omw-1.4' are the lexical databases WordNetLemmatizer needs
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
wordnet_lem = WordNetLemmatizer()
# NOTE(review): lemmatize() is applied to each whole row string, not to
# individual words — WordNetLemmatizer.lemmatize treats its argument as a
# single token, so multi-word strings are mostly returned unchanged.
# Presumably a per-word apply was intended; confirm before changing, since
# the `is_equal` check below depends on this behavior.
modin_df['text_string_lem'] = modin_df['text_string_fdist'].apply(wordnet_lem.lemmatize)
# check if the columns are equal
modin_df['is_equal']= (modin_df['text_string_fdist']==modin_df['text_string_lem'])
# show level count
modin_df.is_equal.value_counts()
# Concatenate the lemmatized tweet strings into a single corpus string
all_words_lem = ' '.join([word for word in modin_df['text_string_lem']])
%matplotlib inline
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# random_state fixes the word placement so reruns produce the same cloud
wordcloud = WordCloud(width=600,
height=400,
random_state=2,
max_font_size=100).generate(all_words_lem)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation='bilinear')
# Trailing ';' suppresses the Axes repr output in the notebook cell
plt.axis('off');
# Import some Tweets from Barack Obama as pandas df
# (repeats the modin pipeline above with plain pandas for comparison)
pandas_df = pandas.read_csv("https://raw.githubusercontent.com/kirenz/twitter-tweepy/main/tweets-obama.csv")
# Lowercase the text so tokenization and stopword matching are case-insensitive
pandas_df['text'] = pandas_df['text'].astype(str).str.lower()
pandas_df.head(3)
# Use a raw string for the pattern: '\w' in a plain string literal is an
# invalid escape sequence (SyntaxWarning since Python 3.12).
# NOTE: this rebinds `regexp` from the modin section above.
regexp = RegexpTokenizer(r'\w+')
pandas_df['text_token']=pandas_df['text'].apply(regexp.tokenize)
pandas_df.head(3)
# Remove stopwords (reuses the `stopwords` collection built in the modin section)
pandas_df['text_token'] = pandas_df['text_token'].apply(lambda x: [item for item in x if item not in stopwords])
pandas_df.head(3)
# Keep only tokens longer than 2 characters and join them back into a string
pandas_df['text_string'] = pandas_df['text_token'].apply(lambda x: ' '.join([item for item in x if len(item)>2]))
pandas_df[['text', 'text_token', 'text_string']].head()
# Concatenate all cleaned tweet strings into one corpus string.
# NOTE: this rebinds `all_words`, `tokenized_words`, and `fdist` from the
# modin section above.
all_words = ' '.join([word for word in pandas_df['text_string']])
tokenized_words = nltk.tokenize.word_tokenize(all_words)
from nltk.probability import FreqDist
# Frequency of each token across the whole (cleaned) corpus
fdist = FreqDist(tokenized_words)
fdist
# Keep tokens that occur at least once in the cleaned corpus; tokens of
# length <= 2 were excluded from `text_string`, so they have frequency 0
# and are filtered out here.
pandas_df['text_string_fdist'] = pandas_df['text_token'].apply(lambda x: ' '.join([item for item in x if fdist[item] >= 1 ]))
pandas_df[['text', 'text_token', 'text_string', 'text_string_fdist']].head()
from nltk.stem import WordNetLemmatizer
wordnet_lem = WordNetLemmatizer()
# NOTE(review): as in the modin section, lemmatize() is applied to each
# whole row string rather than per word, so most multi-word strings come
# back unchanged — confirm whether a per-word apply was intended.
pandas_df['text_string_lem'] = pandas_df['text_string_fdist'].apply(wordnet_lem.lemmatize)
# check if the columns are equal
pandas_df['is_equal']= (pandas_df['text_string_fdist']==pandas_df['text_string_lem'])
# show level count
pandas_df.is_equal.value_counts()
# Concatenate the lemmatized tweet strings into a single corpus string
# (rebinds `all_words_lem` and `wordcloud` from the modin section above)
all_words_lem = ' '.join([word for word in pandas_df['text_string_lem']])
%matplotlib inline
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Same WordCloud parameters as the modin section so the plots are comparable
wordcloud = WordCloud(width=600,
height=400,
random_state=2,
max_font_size=100).generate(all_words_lem)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation='bilinear')
# Trailing ';' suppresses the Axes repr output in the notebook cell
plt.axis('off');