Thursday, April 7, 2022

Sentiment Analysis

This demo program shows how to perform data cleansing, plot the sentiment distribution as a pie chart, and present the most frequent words from the positive tweets in a word cloud.

Data File Used: https://www.kaggle.com/datasets/gpreda/pfizer-vaccine-tweets
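
The sentiment labels in this demo come from TextBlob's polarity score, a float between -1 (most negative) and 1 (most positive); a score of exactly 0 is treated as neutral. A minimal sketch of how a single sentence is scored (the example sentences are made up for illustration, and the exact scores depend on TextBlob's lexicon):

from textblob import TextBlob

# polarity is a float in [-1.0, 1.0]
print(TextBlob("I love this vaccine rollout").sentiment.polarity)   # > 0 -> labelled Positive below
print(TextBlob("The queue was terrible").sentiment.polarity)        # < 0 -> labelled Negative
print(TextBlob("The clinic opens at 9 am").sentiment.polarity)      # == 0 -> labelled Neutral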

 

The output: a bar chart of sentiment counts, a pie chart of the sentiment distribution, a listing of the ten most positive tweets, and a word cloud of the most frequent words in positive tweets.

The code:


import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
from textblob import TextBlob
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
# Download the NLTK resources used below (only needed on the first run)
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
from wordcloud import WordCloud
# scikit-learn imports for an optional classification step (not used in the plots below)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

# Load the Kaggle Pfizer vaccination tweets dataset
df = pd.read_csv('vaccination_tweets.csv')

# Keep only the tweet text; drop the metadata columns
text_df = df.drop(['id', 'user_name', 'user_location', 'user_description', 'user_created',
       'user_followers', 'user_friends', 'user_favourites', 'user_verified',
       'date', 'hashtags', 'source', 'retweets', 'favorites',
       'is_retweet'], axis=1)

def data_processing(text):
    # Lowercase, strip URLs, mentions, hashtag symbols and punctuation, then remove stopwords
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text_tokens = word_tokenize(text)
    filtered_text = [w for w in text_tokens if w not in stop_words]
    return ' '.join(filtered_text)

# Clean the tweet text and drop duplicate tweets
text_df['text'] = text_df['text'].apply(data_processing)
text_df = text_df.drop_duplicates('text')

stemmer = PorterStemmer()
def stemming(data):
    # Stem each word and rebuild the cleaned sentence
    return ' '.join(stemmer.stem(word) for word in data.split())

text_df['text'] = text_df['text'].apply(stemming)

def polarity(text):
    # TextBlob polarity ranges from -1 (negative) to 1 (positive)
    return TextBlob(text).sentiment.polarity

text_df['polarity'] = text_df['text'].apply(polarity)

def sentiment(label):
    # Map the polarity score to a categorical label
    if label < 0:
        return 'Negative'
    elif label == 0:
        return 'Neutral'
    else:
        return 'Positive'

text_df['sentiment'] = text_df['polarity'].apply(sentiment)

# Bar chart of the number of tweets per sentiment
fig = plt.figure(figsize=(5, 5))
sns.countplot(x='sentiment', data=text_df)

# Pie chart of the sentiment distribution
fig = plt.figure(figsize=(7, 7))
colors = ('yellowgreen', 'gold', 'red')
wp = {'linewidth': 2, 'edgecolor': 'black'}
tags = text_df['sentiment'].value_counts()
explode = (0.1, 0.1, 0.1)
tags.plot(kind='pie', autopct='%1.1f%%', shadow=True, colors=colors,
          startangle=90, wedgeprops=wp, explode=explode, label='')
plt.title('Distribution of Sentiments')

# Ten most positive tweets by polarity score
pos_tweets = text_df[text_df.sentiment == 'Positive']
pos_tweets = pos_tweets.sort_values(['polarity'], ascending=False)
print(pos_tweets.head(10))

# Word cloud of the most frequent words in positive tweets
text = ' '.join(word for word in pos_tweets['text'])
plt.figure(figsize=(20, 15), facecolor='None')
wordcloud = WordCloud(max_words=500, width=1600, height=800).generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Most frequent words in positive tweets', fontsize=19)
plt.show()
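
The scikit-learn imports at the top (CountVectorizer, train_test_split, LogisticRegression and the metrics helpers) are not used in the plotting code above; they are the usual ingredients for training a classifier on the TextBlob-derived labels. A minimal sketch of that optional step, assuming text_df from the code above (parameter choices such as max_features, test_size and random_state are illustrative):

# Turn the cleaned tweets into bag-of-words features
vect = CountVectorizer(ngram_range=(1, 2), max_features=5000)
X = vect.fit_transform(text_df['text'])
y = text_df['sentiment']

# Hold out 20% of the tweets for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a logistic regression classifier on the TextBlob labels
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate on the held-out tweets
pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

ConfusionMatrixDisplay(confusion_matrix(y_test, pred), display_labels=model.classes_).plot()
plt.show()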
