This demo program shows how to perform data cleansing on tweets, plot the sentiment distribution in a pie chart, and present the most frequent words from the positive tweets in a word cloud.
Data File Used: https://www.kaggle.com/datasets/gpreda/pfizer-vaccine-tweets
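Before dropping columns it can help to confirm what the downloaded CSV actually contains. A minimal check, assuming the Kaggle file has been saved locally as vaccination_tweets.csv:

import pandas as pd

# load the Kaggle export and inspect its shape and column names
df = pd.read_csv('vaccination_tweets.csv')
print(df.shape)
print(df.columns.tolist())   # should include 'text' plus the metadata columns dropped in the script below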
The output:
The code:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from wordcloud import WordCloud
# scikit-learn imports (not used in the steps shown below)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

# load the dataset and keep only the tweet text
df = pd.read_csv('vaccination_tweets.csv')
text_df = df.drop(['id', 'user_name', 'user_location', 'user_description', 'user_created',
                   'user_followers', 'user_friends', 'user_favourites', 'user_verified',
                   'date', 'hashtags', 'source', 'retweets', 'favorites', 'is_retweet'], axis=1)

def data_processing(text):
    # lower-case, strip URLs, mentions, '#' symbols and punctuation, then remove stop words
    text = text.lower()
    text = re.sub(r'http\S+|www\.\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text_tokens = word_tokenize(text)
    filtered_text = [w for w in text_tokens if w not in stop_words]
    return ' '.join(filtered_text)

text_df['text'] = text_df['text'].apply(data_processing)
text_df = text_df.drop_duplicates('text')

stemmer = PorterStemmer()

def stemming(data):
    # stem each token and return the stemmed sentence
    return ' '.join(stemmer.stem(word) for word in data.split())

text_df['text'] = text_df['text'].apply(stemming)

def polarity(text):
    return TextBlob(text).sentiment.polarity

text_df['polarity'] = text_df['text'].apply(polarity)

def sentiment(label):
    # map the polarity score to a sentiment label
    if label < 0:
        return 'Negative'
    elif label == 0:
        return 'Neutral'
    else:
        return 'Positive'

text_df['sentiment'] = text_df['polarity'].apply(sentiment)

# bar chart of sentiment counts
fig = plt.figure(figsize=(5, 5))
sns.countplot(x='sentiment', data=text_df)

# pie chart of the sentiment distribution
fig = plt.figure(figsize=(7, 7))
colors = ('yellowgreen', 'gold', 'red')
wp = {'linewidth': 2, 'edgecolor': 'black'}
tags = text_df['sentiment'].value_counts()
explode = (0.1, 0.1, 0.1)
tags.plot(kind='pie', autopct='%1.1f%%', shadow=True, colors=colors,
          startangle=90, wedgeprops=wp, explode=explode, label='')
plt.title('Distribution of Sentiments')

# word cloud of the most frequent words in positive tweets
pos_tweets = text_df[text_df.sentiment == 'Positive']
pos_tweets = pos_tweets.sort_values(['polarity'], ascending=False)
print(pos_tweets.head(10))

text = ' '.join([word for word in pos_tweets['text']])
plt.figure(figsize=(20, 15), facecolor='None')
wordcloud = WordCloud(max_words=500, width=1600, height=800).generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Most frequent words in positive tweets', fontsize=19)
plt.show()
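The script relies on several third-party packages and on NLTK data files for tokenization and stop words. A minimal setup sketch, assuming a standard pip environment (package names are the usual PyPI ones):

# pip install pandas numpy seaborn matplotlib textblob nltk wordcloud scikit-learn

import nltk
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('stopwords')  # English stop word list used during cleansing

The downloads only need to be run once per environment; after that, word_tokenize and stopwords.words('english') work offline.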