Bag of Words and TF-IDF: Make a Powerful Movie Recommendation System
What will you learn?
Bag of Words model
The Bag of Words model represents each sentence (or document) as a vector of raw word counts, ignoring grammar and word order. Below, the text is split into sentences, cleaned, lowercased, lemmatized, and stripped of stopwords before it is vectorized.
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Download the required NLTK resources (only needed once)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
paragraph = """Good people live good lives. Bad people attempt to live with what they desire, but they have a bad life."""
wordnet=WordNetLemmatizer()
sentences = nltk.sent_tokenize(paragraph)
final_text = []
for i in range(len(sentences)):
    # keep letters only, lowercase, and split into words
    review = re.sub('[^a-zA-Z]', ' ', sentences[i])
    review = review.lower()
    review = review.split()
    # lemmatize each word and drop English stopwords
    review = [wordnet.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    final_text.append(review)
print(final_text)
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
bag = cv.fit_transform(final_text).toarray()
print(bag)
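To see which column of the matrix corresponds to which word, you can wrap it in a pandas DataFrame. This is a minimal sketch that assumes the cv, bag, and final_text variables from the block above and uses get_feature_names_out(), which is available in recent scikit-learn versions.
# Optional: inspect the Bag of Words matrix together with its vocabulary
import pandas as pd
bow_df = pd.DataFrame(bag, columns=cv.get_feature_names_out(), index=final_text)
print(bow_df)  # each row is a sentence, each column a word count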
TF-IDF model | Natural Language Processing
TF-IDF (term frequency-inverse document frequency) weights each word count by how rare the word is across the documents, so words that appear everywhere contribute less to a document's vector than distinctive ones.
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
paragraph = """Good people live good lives. Bad people attempt to live with what they desire, but they have a bad life."""
wordnet=WordNetLemmatizer()
sentences = nltk.sent_tokenize(paragraph)
final_text = []
for i in range(len(sentences)):
    review = re.sub('[^a-zA-Z]', ' ', sentences[i])
    review = review.lower()
    review = review.split()
    # lemmatize each word and drop English stopwords
    list_stopwords = set(stopwords.words('english'))
    review = [wordnet.lemmatize(word) for word in review if word not in list_stopwords]
    review = ' '.join(review)
    final_text.append(review)
print(final_text)
# Creating the TF-IDF model
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(norm='l2')  # l2 (Euclidean) normalization, which is also the default
tfidf = cv.fit_transform(final_text).toarray()
print(tfidf)
# Display using pandas with feature names
import pandas as pd
tfIdfVectorizer=TfidfVectorizer()
tfIdf = tfIdfVectorizer.fit_transform(final_text)
df = pd.DataFrame(tfIdf[0].T.todense(), index=tfIdfVectorizer.get_feature_names_out(), columns=["TF*IDF"])
df = df.sort_values('TF*IDF', ascending=False)
print(df)
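To make these numbers less mysterious, here is a small sketch of how a single weight is computed under scikit-learn's defaults (smooth_idf=True, sublinear_tf=False, norm='l2'): the raw weight for a term t in a document d is tf(t, d) * (1 + ln((1 + n) / (1 + df(t)))), and each document vector is then L2-normalized. The term "people" below is just an example term assumed to occur in the corpus.
# Minimal sketch: recompute one TF-IDF weight "by hand" (before the per-document l2 normalization)
import numpy as np
docs = final_text                                  # the preprocessed sentences from above
n_docs = len(docs)
term = "people"                                    # example term
tf = docs[0].split().count(term)                   # raw count in the first document
df_t = sum(1 for d in docs if term in d.split())   # number of documents containing the term
idf = 1 + np.log((1 + n_docs) / (1 + df_t))        # smoothed IDF, as used by TfidfVectorizer
print("unnormalized weight:", tf * idf)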
Movie Recommendation System using TF-IDF model | Natural Language Processing
We now build the recommender: every plot in the wiki_movie_plots_deduped.csv dataset is turned into a TF-IDF vector, and for a given title we recommend the movies whose plots are most similar by cosine similarity.
from google.colab import drive
drive.mount("/content/drive")
cd 'drive/My Drive'
import pandas as pd
data = pd.read_csv("wiki_movie_plots_deduped.csv")
data.head()
data.shape
import numpy as np
np.unique(data['Origin/Ethnicity'])
data2 = data.loc[(data['Origin/Ethnicity']=='American') & (data['Release Year']>2015)]
len(data2)
my_data = pd.DataFrame(data2)
my_data.tail()
finaldata = my_data[["Title", "Plot"]]
finaldata = finaldata.set_index('Title')
finaldata.head(50)
finaldata["Plot"]["Criminal"]
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemmatizer = WordNetLemmatizer()
def preprocess_sentences(text):
    text = text.lower()
    words = nltk.word_tokenize(text)
    # lemmatize and remove English stopwords
    my_sent = [lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english')]
    finalsent = ' '.join(my_sent)
    # expand the contraction fragments produced by the tokenizer (e.g. "n't", "'s")
    finalsent = finalsent.replace("n't", " not")
    finalsent = finalsent.replace("'m", " am")
    finalsent = finalsent.replace("'s", " is")
    finalsent = finalsent.replace("'re", " are")
    finalsent = finalsent.replace("'ll", " will")
    finalsent = finalsent.replace("'ve", " have")
    finalsent = finalsent.replace("'d", " would")
    return finalsent
finaldata["new_plot"]= finaldata["Plot"].apply(preprocess_sentences)
finaldata.head()
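As a quick sanity check, you can run the function on a short string; the sentence below is just a made-up illustration, not a plot from the dataset.
# Hypothetical example, only to show what the preprocessing does
sample = "The heroes don't give up; they're fighting for the city."
print(preprocess_sentences(sample))  # lowercased, lemmatized, stopword-free, contractions expanded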
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf_movieid = tfidf.fit_transform(finaldata["new_plot"])
# Finding cosine similarity between vectors
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(tfidf_movieid, tfidf_movieid)
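The result is a square matrix with one row and one column per movie, and the diagonal is (approximately) 1.0 because every plot is maximally similar to itself. A quick sanity check, assuming the similarity matrix computed above:
print(similarity.shape)   # (n_movies, n_movies)
print(similarity[0][:5])  # similarity of the first movie to the first five movies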
indices = pd.Series(finaldata.index)
def recommendations(title, cosine_sim=similarity):
    try:
        # position of the requested title in the index
        index = indices[indices == title].index[0]
        print(index)
        # similarity of this movie to every other movie, highest first
        similarity_scores = pd.Series(cosine_sim[index]).sort_values(ascending=False)
        print(similarity_scores)
        # skip position 0 (the movie itself) and take the next ten
        top_10_movies = list(similarity_scores.iloc[1:11].index)
        print(top_10_movies)
        recommended_movies = [list(finaldata.index)[i] for i in top_10_movies]
        return recommended_movies
    except:
        print("No movie name found")
recommendations("Harry Potter and the Chamber of Secrets")
Output:
recommendations("Spider-Man")
Output:
recommendations("Ice Age")
Output:
Conclusion
In this post, I explained the Bag of Words and TF-IDF models, which are key concepts in Natural Language Processing, and built a complete Movie Recommendation System project on top of the TF-IDF model. I hope you like this project.
If you have any questions, ask me in the comment section. I will get back to you as soon as possible. Keep learning.