# -*- coding: utf-8 -*-
"""Untitled5.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1sh_pksa1nbUYutVyEGl-vBZh7o3KwowL
"""
from google.colab import drive
drive.mount('/content/drive')
# Commented out IPython magic to ensure Python compatibility.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# %matplotlib inline
# Load the per-country trending datasets; only the US frame is analyzed below.
df_usa=pd.read_csv("/content/USvideos.csv")
df_ca=pd.read_csv("/content/CAvideos.csv")
df_de=pd.read_csv("/content/DEvideos.csv")
df_fr=pd.read_csv("/content/FRvideos.csv")
df_gb=pd.read_csv("/content/GBvideos.csv")
# 'trending_date' and 'publish_time' are not in standard datetime formats, so parse them first.
df_usa['trending_date'] = pd.to_datetime(df_usa['trending_date'], format='%y.%d.%m')
df_usa['publish_time'] = pd.to_datetime(df_usa['publish_time'], format='%Y-%m-%dT%H:%M:%S.%fZ')
# Split 'publish_time' into separate date and time columns
df_usa.insert(4, 'publish_date', df_usa['publish_time'].dt.date)
df_usa['publish_time'] = df_usa['publish_time'].dt.time
df_usa['publish_date']=pd.to_datetime(df_usa['publish_date'])
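# Hedged sketch (not part of the original script): a hypothetical helper that
# applies the same parsing to a raw frame such as df_ca, coercing malformed
# timestamps to NaT instead of raising.
def parse_trending_dates(df):
    out = df.copy()
    out['trending_date'] = pd.to_datetime(out['trending_date'], format='%y.%d.%m', errors='coerce')
    out['publish_time'] = pd.to_datetime(out['publish_time'], format='%Y-%m-%dT%H:%M:%S.%fZ', errors='coerce')
    return out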
# To see how views, likes, dislikes, and comment counts relate, plot a correlation matrix.
columns_show=['views', 'likes', 'dislikes', 'comment_count']
f, ax = plt.subplots(figsize=(8, 8))
corr = df_usa[columns_show].corr()
sns.heatmap(corr, cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax, annot=True)  # an all-False mask is a no-op, so it is omitted
# Total views, likes, dislikes, and comment counts per video (summed over trending days).
usa_video_views=df_usa.groupby(['video_id'])['views'].agg('sum')
usa_video_likes=df_usa.groupby(['video_id'])['likes'].agg('sum')
usa_video_dislikes=df_usa.groupby(['video_id'])['dislikes'].agg('sum')
usa_video_comment_count=df_usa.groupby(['video_id'])['comment_count'].agg('sum')
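# Equivalent single pass (a sketch, not part of the original script):
# aggregate all four metrics with one groupby instead of four separate ones.
usa_video_stats = df_usa.groupby('video_id')[['views', 'likes', 'dislikes', 'comment_count']].sum()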
# Collapse repeated trending rows so per-video statistics are not inflated.
# Videos that trended for exactly one day:
df_usa_single_day_trend = df_usa.drop_duplicates(subset='video_id', keep=False)
# Videos that trended on multiple days (keep their first trending row):
df_usa_multiple_day_trend = df_usa[df_usa.duplicated(subset='video_id', keep=False)].drop_duplicates(subset='video_id', keep='first')
# Together these cover every video exactly once.
frames = [df_usa_single_day_trend, df_usa_multiple_day_trend]
df_usa_without_duplicates = pd.concat(frames)
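# Sanity check (an addition, not in the original script): after the split
# and concat, every video_id should appear exactly once.
assert df_usa_without_duplicates['video_id'].is_unique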
df_usa_comment_disabled=df_usa_without_duplicates[df_usa_without_duplicates['comments_disabled']==True].describe()
df_usa_rating_disabled=df_usa_without_duplicates[df_usa_without_duplicates['ratings_disabled']==True].describe()
df_usa_video_error=df_usa_without_duplicates[df_usa_without_duplicates['video_error_or_removed']==True].describe()
# How many videos trended for only a single day?
df_usa_single_day_trend.head()
# Videos that trended for more than one day
df_usa_multiple_day_trend.head()
# Which videos trended for the most days? Count trending-date rows per video.
df_usa_which_video_trended_maximum_days=df_usa.groupby(by=['video_id'],as_index=False).count().sort_values(by='title',ascending=False).head()
plt.figure(figsize=(10,10))
sns.set_style("whitegrid")
ax = sns.barplot(x=df_usa_which_video_trended_maximum_days['video_id'],y=df_usa_which_video_trended_maximum_days['trending_date'], data=df_usa_which_video_trended_maximum_days)
plt.xlabel("Video Id")
plt.ylabel("Count")
plt.title("Top 5 Videos that trended maximum days in USA")
# Totals for the video that trended the most days ('sXP6vliZIHI')
df_usa_maximum_views=usa_video_views['sXP6vliZIHI']
df_usa_maximum_likes=usa_video_likes['sXP6vliZIHI']
df_usa_maximum_dislikes=usa_video_dislikes['sXP6vliZIHI']
df_usa_maximum_comment=usa_video_comment_count['sXP6vliZIHI']
# Which videos took the longest from publication to trending?
df_usa_multiple_day_trend['Days_taken_to_be_trending_video'] =df_usa_multiple_day_trend['trending_date'] - df_usa_multiple_day_trend['publish_date']
df_usa_multiple_day_trend['Days_taken_to_be_trending_video']= df_usa_multiple_day_trend['Days_taken_to_be_trending_video'] / np.timedelta64(1, 'D')
usa_no_of_days_take_trend=df_usa_multiple_day_trend.sort_values(by='Days_taken_to_be_trending_video',ascending=False).head(5)
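# Equivalent variant (a sketch, not part of the original script): the
# timedelta accessor .dt.days yields whole days directly as integers.
days_taken_alt = (df_usa_multiple_day_trend['trending_date'] - df_usa_multiple_day_trend['publish_date']).dt.days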
plt.figure(figsize=(10,10))
sns.set_style("whitegrid")
ax = sns.barplot(x=usa_no_of_days_take_trend['title'],y=usa_no_of_days_take_trend['Days_taken_to_be_trending_video'], data=usa_no_of_days_take_trend)
plt.xlabel("Video Title")
plt.xticks(rotation=15)
plt.ylabel("No. of Days")
plt.title("Maximum no of days taken by 5 videos to be popular in USA")
# Top 5 trending channels in the USA
usa_trending_channel=df_usa_without_duplicates.groupby(by=['channel_title'],as_index=False).count().sort_values(by='title',ascending=False).head()
plt.figure(figsize=(10,10))
sns.set_style("whitegrid")
ax = sns.barplot(x=usa_trending_channel['channel_title'],y=usa_trending_channel['video_id'], data=usa_trending_channel)
plt.xlabel("Channel Title")
plt.xticks(rotation=15)
plt.ylabel("Count")
plt.title("Top 5 Trending Channel in USA")
# Top 5 category IDs in the USA
usa_category_id=df_usa_without_duplicates.groupby(by=['category_id'],as_index=False).count().sort_values(by='title',ascending=False).head(5)
plt.figure(figsize=(7,7))
sns.kdeplot(usa_category_id['category_id'])
plt.xlabel("Category IDs")
plt.xticks(rotation=15)
plt.ylabel("Density")  # kdeplot draws a density estimate, not raw counts
plt.title("Top 5 Category IDs for USA")
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from wordcloud import WordCloud, STOPWORDS
def wc(data, bgcolor, title):
    plt.figure(figsize=(100, 100))
    wc = WordCloud(background_color=bgcolor, max_words=1000, max_font_size=50)
    wc.generate(' '.join(data))
    plt.imshow(wc)
    plt.title(title)  # show the caption passed in
    plt.axis('off')
!pip install nltk
import nltk
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('stopwords')  # corpus required by stopwords.words('english') below
!pip install stop-words
from stop_words import get_stop_words
import re
top_N = 100
# Concatenate all titles into one lowercase string
a = df_usa['title'].str.lower().str.cat(sep=' ')
# Strip punctuation and digits, keeping only letters (returns a single string)
b = re.sub('[^A-Za-z]+', ' ', a)
# Remove stopwords (union of the stop-words package and NLTK lists)
stop_words = list(get_stop_words('en'))
nltk_words = list(stopwords.words('english'))
stop_words.extend(nltk_words)
word_tokens = word_tokenize(b)
filtered_sentence = [w for w in word_tokens if w not in stop_words]
# Drop tokens of length 2 or less
without_single_chr = [word for word in filtered_sentence if len(word) > 2]
# Remove numbers
cleaned_data_title = [word for word in without_single_chr if not word.isnumeric()]
# Calculate frequency distribution
word_dist = nltk.FreqDist(cleaned_data_title)
rslt = pd.DataFrame(word_dist.most_common(top_N),
columns=['Word', 'Frequency'])
plt.figure(figsize=(10,10))
sns.set_style("whitegrid")
ax = sns.barplot(x="Word",y="Frequency", data=rslt.head(7))
# WordCloud for the title column
wc(cleaned_data_title, 'black', 'Common Words')
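# The tags and description sections below repeat the same cleaning steps.
# A reusable helper (a sketch, not part of the original script) could
# replace all three pipelines:
def clean_text_column(series, stop_words):
    """Lowercase, strip non-letters, drop stopwords and short/numeric tokens."""
    text = series.astype(str).str.lower().str.cat(sep=' ')
    text = re.sub('[^A-Za-z]+', ' ', text)
    tokens = word_tokenize(text)
    return [w for w in tokens if w not in stop_words and len(w) > 2 and not w.isnumeric()]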
# Count word frequencies in the 'tags' column (same pipeline as for titles).
top_N = 100
# Concatenate all tags into one lowercase string
tags_lower = df_usa['tags'].str.lower().str.cat(sep=' ')
# Strip punctuation and digits, keeping only letters
tags_remove_pun = re.sub('[^A-Za-z]+', ' ', tags_lower)
# Remove stopwords (reusing the combined list built above)
word_tokens_tags = word_tokenize(tags_remove_pun)
filtered_sentence_tags = [w_tags for w_tags in word_tokens_tags if w_tags not in stop_words]
# Drop tokens of length 2 or less
without_single_chr_tags = [word_tags for word_tags in filtered_sentence_tags if len(word_tags) > 2]
# Remove numbers
cleaned_data_tags = [word_tags for word_tags in without_single_chr_tags if not word_tags.isnumeric()]
# Calculate frequency distribution
word_dist_tags = nltk.FreqDist(cleaned_data_tags)
rslt_tags = pd.DataFrame(word_dist_tags.most_common(top_N),
columns=['Word', 'Frequency'])
plt.figure(figsize=(10,10))
sns.set_style("whitegrid")
ax = sns.barplot(x="Word",y="Frequency", data=rslt_tags.head(7))
# WordCloud for the tags column
wc(cleaned_data_tags, 'black', 'Common Words')
# Count word frequencies in the 'description' column (same pipeline again).
top_N = 100
# Concatenate all descriptions into one lowercase string
desc_lower = df_usa['description'].str.lower().str.cat(sep=' ')
# Strip punctuation and digits, keeping only letters
desc_remove_pun = re.sub('[^A-Za-z]+', ' ', desc_lower)
# Remove stopwords (reusing the combined list built above)
word_tokens_desc = word_tokenize(desc_remove_pun)
filtered_sentence_desc = [w_desc for w_desc in word_tokens_desc if w_desc not in stop_words]
# Drop tokens of length 2 or less
without_single_chr_desc = [word_desc for word_desc in filtered_sentence_desc if len(word_desc) > 2]
# Remove numbers
cleaned_data_desc = [word_desc for word_desc in without_single_chr_desc if not word_desc.isnumeric()]
# Calculate frequency distribution
word_dist_desc = nltk.FreqDist(cleaned_data_desc)
rslt_desc = pd.DataFrame(word_dist_desc.most_common(top_N),
columns=['Word', 'Frequency'])
plt.figure(figsize=(10,10))
sns.set_style("whitegrid")
ax = sns.barplot(x="Word", y="Frequency", data=rslt_desc.head(7))
# WordCloud for the description column
wc(cleaned_data_desc, 'black', 'Frequent Words')
# Classify each description as Positive/Neutral/Negative using TextBlob
from textblob import TextBlob
bloblist_desc = list()
df_usa_descr_str = df_usa['description'].astype(str)
for row in df_usa_descr_str:
    blob = TextBlob(row)
    bloblist_desc.append((row, blob.sentiment.polarity, blob.sentiment.subjectivity))
# Columns: the raw text, TextBlob's polarity, and TextBlob's subjectivity
df_usa_polarity_desc = pd.DataFrame(bloblist_desc, columns=['sentence', 'polarity', 'subjectivity'])
def f(df_usa_polarity_desc):
    if df_usa_polarity_desc['polarity'] > 0:
        val = "Positive"
    elif df_usa_polarity_desc['polarity'] == 0:
        val = "Neutral"
    else:
        val = "Negative"
    return val
df_usa_polarity_desc['Sentiment_Type'] = df_usa_polarity_desc.apply(f, axis=1)
plt.figure(figsize=(10,10))
sns.set_style("whitegrid")
ax = sns.countplot(x="Sentiment_Type", data=df_usa_polarity_desc)
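# Vectorized alternative (a sketch, not the original approach): np.select
# avoids the per-row apply and scales better on large frames.
df_usa_polarity_desc['Sentiment_Type'] = np.select(
    [df_usa_polarity_desc['polarity'] > 0, df_usa_polarity_desc['polarity'] == 0],
    ['Positive', 'Neutral'], default='Negative')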
# Classify each tag string as Positive/Neutral/Negative using TextBlob
bloblist_tags = list()
df_usa_tags_str = df_usa['tags'].astype(str)  # astype(str) guards against missing values
for row in df_usa_tags_str:
    blob = TextBlob(row)
    bloblist_tags.append((row, blob.sentiment.polarity, blob.sentiment.subjectivity))
df_usa_polarity_tags = pd.DataFrame(bloblist_tags, columns=['sentence', 'polarity', 'subjectivity'])
def f_tags(df_usa_polarity_tags):
    if df_usa_polarity_tags['polarity'] > 0:
        val = "Positive"
    elif df_usa_polarity_tags['polarity'] == 0:
        val = "Neutral"
    else:
        val = "Negative"
    return val
df_usa_polarity_tags['Sentiment_Type'] = df_usa_polarity_tags.apply(f_tags, axis=1)
plt.figure(figsize=(10,10))
sns.set_style("whitegrid")
ax = sns.countplot(x="Sentiment_Type", data=df_usa_polarity_tags)
# Classify each title as Positive/Neutral/Negative using TextBlob
bloblist_title = list()
df_usa_title_str = df_usa['title'].astype(str)
for row in df_usa_title_str:
    blob = TextBlob(row)
    bloblist_title.append((row, blob.sentiment.polarity, blob.sentiment.subjectivity))
df_usa_polarity_title = pd.DataFrame(bloblist_title, columns=['sentence', 'polarity', 'subjectivity'])
def f_title(df_usa_polarity_title):
    if df_usa_polarity_title['polarity'] > 0:
        val = "Positive"
    elif df_usa_polarity_title['polarity'] == 0:
        val = "Neutral"
    else:
        val = "Negative"
    return val
df_usa_polarity_title['Sentiment_Type'] = df_usa_polarity_title.apply(f_title, axis=1)
plt.figure(figsize=(10,10))
sns.set_style("whitegrid")
ax = sns.countplot(x="Sentiment_Type", data=df_usa_polarity_title)
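# Side-by-side summary (an addition, not in the original script): stack the
# three Sentiment_Type distributions into one grouped bar chart.
summary = pd.DataFrame({
    'description': df_usa_polarity_desc['Sentiment_Type'].value_counts(),
    'tags': df_usa_polarity_tags['Sentiment_Type'].value_counts(),
    'title': df_usa_polarity_title['Sentiment_Type'].value_counts(),
})
summary.plot.bar(figsize=(10, 6), rot=0, title='Sentiment by text field (USA)')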