Back to Modin

Demonstrating NLTK Modin Interoperability

examples/jupyter/integrations/NLTK.ipynb

0.37.14.8 KB
Original Source

Demonstrating NLTK Modin Interoperability

All the examples in this section are taken / adapted from https://www.kirenz.com/post/2021-12-11-text-mining-and-sentiment-analysis-with-nltk-and-pandas-in-python/text-mining-and-sentiment-analysis-with-nltk-and-pandas-in-python/

python
import modin.pandas as pd
import pandas
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
python
# Load Barack Obama's tweets from the tutorial's GitHub mirror into a
# Modin DataFrame and preview the first rows.
tweets_url = "https://raw.githubusercontent.com/kirenz/twitter-tweepy/main/tweets-obama.csv"
modin_df = pd.read_csv(tweets_url)
modin_df.head(3)
python
# Normalize the tweet text to lowercase, coercing any non-string
# values (e.g. NaN) to their string form first.
text_as_str = modin_df['text'].astype(str)
modin_df['text'] = text_as_str.str.lower()
modin_df.head(3)
python
# Tokenize each tweet on runs of word characters. A raw string is used
# for the pattern: '\w+' is an invalid escape sequence that raises a
# SyntaxWarning on Python 3.12+ (and will eventually be an error).
regexp = RegexpTokenizer(r'\w+')

modin_df['text_token'] = modin_df['text'].apply(regexp.tokenize)
modin_df.head(3)
python
# Fetch the NLTK stopword corpus (a no-op if it is already cached locally).
nltk.download('stopwords')
python
# Build the English stopword list and extend it with corpus-specific
# noise tokens (URL fragments left over from tweets).
# NOTE: this rebinds the name `stopwords`, shadowing the module imported
# from nltk.corpus earlier in the file; later cells rely on this name
# holding the list, so it is kept.
custom_stopwords = ['https']
stopwords = nltk.corpus.stopwords.words("english") + custom_stopwords
python
# Remove stopwords from each tweet's token list. Membership tests
# against a set are O(1) versus O(n) for the list, so the conversion is
# hoisted out of the per-token loop.
stopword_set = set(stopwords)
modin_df['text_token'] = modin_df['text_token'].apply(
    lambda tokens: [tok for tok in tokens if tok not in stopword_set]
)
modin_df.head(3)
python
# Rejoin each tweet's tokens into a single string, keeping only tokens
# longer than two characters.
def _long_tokens_to_string(tokens):
    return ' '.join(tok for tok in tokens if len(tok) > 2)

modin_df['text_string'] = modin_df['text_token'].apply(_long_tokens_to_string)
modin_df[['text', 'text_token', 'text_string']].head()
python
# Fetch the Punkt sentence/word tokenizer models needed by word_tokenize.
nltk.download('punkt')
python
# Concatenate every cleaned tweet into one corpus-level string, then
# tokenize it for frequency analysis. str.join consumes the Series
# directly — the original wrapped it in a redundant list comprehension.
all_words = ' '.join(modin_df['text_string'])
tokenized_words = nltk.tokenize.word_tokenize(all_words)
python
from nltk.probability import FreqDist

# Count how often each token occurs across the whole corpus; used below
# to filter out tokens that never appear in the cleaned text.
fdist = FreqDist(tokenized_words)
fdist
python
# Keep only tokens that occur at least once in the corpus frequency
# distribution, rejoining the survivors into a string per tweet.
modin_df['text_string_fdist'] = modin_df['text_token'].apply(
    lambda tokens: ' '.join(tok for tok in tokens if fdist[tok] >= 1)
)
modin_df[['text', 'text_token', 'text_string', 'text_string_fdist']].head()
python
# Lemmatization: fetch WordNet and its multilingual mapping, which the
# WordNetLemmatizer below requires.
nltk.download('wordnet')
nltk.download('omw-1.4')
python
from nltk.stem import WordNetLemmatizer

wordnet_lem = WordNetLemmatizer()

# NOTE(review): lemmatize() expects a single word; applying it to the
# whole space-joined string likely leaves most rows unchanged (the
# `is_equal` check below measures exactly this) — confirm intent.
modin_df['text_string_lem'] = modin_df['text_string_fdist'].apply(wordnet_lem.lemmatize)
python
# Flag the rows where lemmatization left the text unchanged.
modin_df['is_equal'] = modin_df['text_string_fdist'].eq(modin_df['text_string_lem'])
python
# Show how many rows were (un)affected by lemmatization.
modin_df['is_equal'].value_counts()
python
# Join the lemmatized tweets into one string for the word cloud; join
# takes the Series directly — no list comprehension needed.
all_words_lem = ' '.join(modin_df['text_string_lem'])
python
%matplotlib inline
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Render a word cloud of the lemmatized corpus; random_state pins the
# layout so repeated runs produce the same image.
wordcloud = WordCloud(width=600, 
                     height=400, 
                     random_state=2, 
                     max_font_size=100).generate(all_words_lem)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off');

Replicating NLTK workflow with pandas

python
# Load the same Obama tweets into a plain pandas DataFrame so the
# pipeline below can be compared against the Modin run above.
tweets_csv_url = "https://raw.githubusercontent.com/kirenz/twitter-tweepy/main/tweets-obama.csv"
pandas_df = pandas.read_csv(tweets_csv_url)
python
# Lowercase the tweet text, stringifying non-string values (e.g. NaN) first.
stringified = pandas_df['text'].astype(str)
pandas_df['text'] = stringified.str.lower()
pandas_df.head(3)
python
# Tokenize on runs of word characters. Raw string: '\w+' is an invalid
# escape sequence (SyntaxWarning on Python 3.12+, future error).
regexp = RegexpTokenizer(r'\w+')

pandas_df['text_token'] = pandas_df['text'].apply(regexp.tokenize)
pandas_df.head(3)
python
# Remove stopwords from each token list; a set makes the per-token
# membership test O(1) instead of O(n) over the stopword list.
stopword_lookup = set(stopwords)
pandas_df['text_token'] = pandas_df['text_token'].apply(
    lambda tokens: [tok for tok in tokens if tok not in stopword_lookup]
)
pandas_df.head(3)
python
# Rejoin tokens longer than two characters into a per-tweet string.
def _keep_long_tokens(tokens):
    return ' '.join(tok for tok in tokens if len(tok) > 2)

pandas_df['text_string'] = pandas_df['text_token'].apply(_keep_long_tokens)
pandas_df[['text', 'text_token', 'text_string']].head()
python
# Build one corpus string and tokenize it; join consumes the Series
# directly, so the original's list comprehension was redundant.
all_words = ' '.join(pandas_df['text_string'])
tokenized_words = nltk.tokenize.word_tokenize(all_words)
python
from nltk.probability import FreqDist

# Corpus-wide token frequencies, used to filter the per-tweet tokens below.
fdist = FreqDist(tokenized_words)
fdist
python
# Keep only tokens present in the corpus frequency distribution and
# rejoin them into a string per tweet.
pandas_df['text_string_fdist'] = pandas_df['text_token'].apply(
    lambda tokens: ' '.join(tok for tok in tokens if fdist[tok] >= 1)
)
pandas_df[['text', 'text_token', 'text_string', 'text_string_fdist']].head()
python
from nltk.stem import WordNetLemmatizer

wordnet_lem = WordNetLemmatizer()

# NOTE(review): lemmatize() operates on a single word; passing the whole
# joined string likely leaves most rows unchanged — confirm intent.
pandas_df['text_string_lem'] = pandas_df['text_string_fdist'].apply(wordnet_lem.lemmatize)
python
# Flag rows where lemmatization produced no change.
pandas_df['is_equal'] = pandas_df['text_string_fdist'].eq(pandas_df['text_string_lem'])
python
# Tally how many rows lemmatization did / did not change.
pandas_df['is_equal'].value_counts()
python
# One lemmatized corpus string for the word cloud; join takes the
# Series directly — no list comprehension needed.
all_words_lem = ' '.join(pandas_df['text_string_lem'])
python
%matplotlib inline
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Word cloud of the pandas-pipeline corpus; random_state pins the layout
# so it can be compared visually with the Modin result above.
wordcloud = WordCloud(width=600, 
                     height=400, 
                     random_state=2, 
                     max_font_size=100).generate(all_words_lem)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off');