This demo program shows how to improve the accuracy score of a logistic regression model.
The Output:
The code:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import style
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from textblob import TextBlob
from wordcloud import WordCloud

style.use('ggplot')
stop_words = set(stopwords.words('english'))

# Patterns used by data_processing(), compiled once instead of on every tweet.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')   # http/https links and bare www. links
_MENTION_HASH_RE = re.compile(r'@\w+|#')         # whole @mentions, and the '#' of hashtags
_PUNCT_RE = re.compile(r'[^\w\s]')               # anything that is not a word char or whitespace

# Load the tweets and drop the metadata columns the rest of the script never reads.
df = pd.read_csv(r'c:\sentiment\vaccination_tweets.csv')
text_df = df.drop(
    [
        'id', 'user_name', 'user_location', 'user_description', 'user_created',
        'user_followers', 'user_friends', 'user_favourites', 'user_verified',
        'date', 'hashtags', 'source', 'retweets', 'favorites', 'is_retweet',
    ],
    axis=1,
)


def data_processing(text):
    """Return *text* lower-cased and stripped of URLs, mentions, '#', punctuation and stop words.

    The surviving tokens are re-joined with single spaces.
    """
    text = text.lower()
    text = _URL_RE.sub('', text)
    # The previous pattern (r'\@w+') only matched a literal "@w...", so
    # "@who" became "ho" and other handles were left in the text.
    text = _MENTION_HASH_RE.sub('', text)
    text = _PUNCT_RE.sub('', text)
    tokens = word_tokenize(text)
    return ' '.join(tok for tok in tokens if tok not in stop_words)


text_df['text'] = text_df['text'].apply(data_processing)
text_df = text_df.drop_duplicates('text')


def polarity(text):
    """Return the TextBlob sentiment polarity of *text*."""
    return TextBlob(text).sentiment.polarity


# Score polarity on the cleaned, un-stemmed text. The old stemming() was a
# no-op, so this keeps the polarity values the script always produced.
# NOTE(review): stemmed tokens are presumably a poorer match for TextBlob's
# sentiment lexicon -- confirm before moving this after the stemming step.
text_df['polarity'] = text_df['text'].apply(polarity)

stemmer = PorterStemmer()


def stemming(data):
    """Return *data* with every whitespace-separated word replaced by its Porter stem.

    The earlier version iterated over the characters of the string and then
    returned the input unchanged, so no stemming ever took place.
    """
    return ' '.join(stemmer.stem(word) for word in data.split())


text_df['text'] = text_df['text'].apply(stemming)


def sentiment(label):
    """Map a polarity score to 'Negative', 'Neutral' or 'Positive'.

    A value that compares false against all three branches (e.g. NaN)
    falls through and yields None.
    """
    if label < 0:
        return 'Negative'
    elif label == 0:
        return 'Neutral'
    elif label > 0:
        return 'Positive'
import warnings

from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import GridSearchCV

# Turn each polarity score into a class label; this is the prediction target.
text_df['sentiment'] = text_df['polarity'].apply(sentiment)

# Bag-of-words features over unigrams and bigrams.
vect = CountVectorizer(ngram_range=(1, 2)).fit(text_df['text'])
feature_names = vect.get_feature_names_out()
print(f'Number of features:{len(feature_names)}\n')
print(f'First 20 features: {feature_names[:20]}')

X = vect.transform(text_df['text'])
Y = text_df['sentiment']
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Search over the inverse regularization strength C.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
# Give the solver more iterations than the default instead of hiding its
# non-convergence behind a process-wide warnings.filterwarnings('ignore').
grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid)
with warnings.catch_warnings():
    # Only convergence chatter is silenced, and only for the duration of the
    # fit, so any other warning still reaches the user.
    warnings.simplefilter('ignore', category=ConvergenceWarning)
    grid.fit(x_train, y_train)
print('Best parameters:', grid.best_params_)

y_pred = grid.predict(x_test)

# accuracy_score is documented as (y_true, y_pred); the value is the same
# either way, but this now matches the calls below.
logreg_acc = accuracy_score(y_test, y_pred)
print(f'Test accuracy: {logreg_acc * 100:.2f}%')

print(confusion_matrix(y_test, y_pred))
print('\n')
print(classification_report(y_test, y_pred))
No comments:
Post a Comment