import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import requests
Downloading the dataset
# Download the Book-Crossings archive and save it in the working directory.
url = "https://cdn.freecodecamp.org/project-data/books/book-crossings.zip"
query_parameters = {"downloadformat": "zip"}
response = requests.get(url, params=query_parameters)
# Fail fast on an HTTP error instead of saving an error page as a .zip file.
response.raise_for_status()
with open("book-crossings.zip", mode="wb") as file:
    file.write(response.content)
import zipfile
# Unpack the downloaded archive into book-crossings/. extractall() already
# extracts every member, so it is called once rather than once per member name.
with zipfile.ZipFile("book-crossings.zip", mode="r") as archive:
    archive.extractall("book-crossings/")
# Paths of the two CSV files inside the extracted archive directory.
books_filename = "book-crossings/BX-Books.csv"
ratings_filename = "book-crossings/BX-Book-Ratings.csv"
# Load the book catalogue: semicolon-separated, ISO-8859-1 encoded.
# header=0 together with names= replaces the file's own header row with the
# short column names used throughout the rest of the notebook.
df_books = pd.read_csv(
    books_filename,
    encoding="ISO-8859-1",
    sep=";",
    header=0,
    names=["isbn", "title", "author"],
    usecols=["isbn", "title", "author"],
    dtype={"isbn": "str", "title": "str", "author": "str"},
)
# Load the ratings: semicolon-separated, ISO-8859-1 encoded.
# header=0 together with names= replaces the file's own header row with the
# short column names used throughout the rest of the notebook.
df_ratings = pd.read_csv(
    ratings_filename,
    encoding="ISO-8859-1",
    sep=";",
    header=0,
    names=["user", "isbn", "rating"],
    usecols=["user", "isbn", "rating"],
    dtype={"user": "int32", "isbn": "str", "rating": "float32"},
)
Exploring the dataset
df_books
isbn | title | author | |
---|---|---|---|
0 | 0195153448 | Classical Mythology | Mark P. O. Morford |
1 | 0002005018 | Clara Callan | Richard Bruce Wright |
2 | 0060973129 | Decision in Normandy | Carlo D'Este |
3 | 0374157065 | Flu: The Story of the Great Influenza Pandemic... | Gina Bari Kolata |
4 | 0393045218 | The Mummies of Urumchi | E. J. W. Barber |
... | ... | ... | ... |
271374 | 0440400988 | There's a Bat in Bunk Five | Paula Danziger |
271375 | 0525447644 | From One to One Hundred | Teri Sloat |
271376 | 006008667X | Lily Dale : The True Story of the Town that Ta... | Christine Wicker |
271377 | 0192126040 | Republic (World's Classics) | Plato |
271378 | 0767409752 | A Guided Tour of Rene Descartes' Meditations o... | Christopher Biffle |
271379 rows × 3 columns
df_ratings
user | isbn | rating | |
---|---|---|---|
0 | 276725 | 034545104X | 0.0 |
1 | 276726 | 0155061224 | 5.0 |
2 | 276727 | 0446520802 | 0.0 |
3 | 276729 | 052165615X | 3.0 |
4 | 276729 | 0521795028 | 6.0 |
... | ... | ... | ... |
1149775 | 276704 | 1563526298 | 9.0 |
1149776 | 276706 | 0679447156 | 0.0 |
1149777 | 276709 | 0515107662 | 10.0 |
1149778 | 276721 | 0590442449 | 10.0 |
1149779 | 276723 | 05162443314 | 8.0 |
1149780 rows × 3 columns
# Remove from the dataset users with fewer than 200 ratings and books with fewer than 100 ratings
# Function adapted from
# https://datascienceplus.com/building-a-book-recommender-system-the-basics-knn-and-matrix-factorization/
# Count how many ratings each user gave and each ISBN received.
counts_user = df_ratings["user"].value_counts()
counts_isbn = df_ratings["isbn"].value_counts()
# Keep only the ids that reach the thresholds (>= 200 per user, >= 100 per book).
# From here on both names hold an Index of ids, not the counts themselves.
counts_user = counts_user[counts_user >= 200].index
counts_isbn = counts_isbn[counts_isbn >= 100].index
# A rating survives only if both its user and its book passed the thresholds.
df_ratings_filtered = df_ratings.loc[
    (df_ratings["user"].isin(counts_user.values))
    & (df_ratings["isbn"].isin(counts_isbn.values))
]
df_ratings_filtered.shape
(49781, 3)
Joining the two dataframes
# Attach title and author to every remaining rating by joining on the ISBN.
df_books_ratings = pd.merge(df_books, df_ratings_filtered, on="isbn")
# Normalize author capitalization to title case.
df_books_ratings["author"] = df_books_ratings["author"].str.title()
df_books_ratings
isbn | title | author | user | rating | |
---|---|---|---|---|---|
0 | 0440234743 | The Testament | John Grisham | 277478 | 0.0 |
1 | 0440234743 | The Testament | John Grisham | 2977 | 0.0 |
2 | 0440234743 | The Testament | John Grisham | 3363 | 0.0 |
3 | 0440234743 | The Testament | John Grisham | 7346 | 9.0 |
4 | 0440234743 | The Testament | John Grisham | 9856 | 0.0 |
... | ... | ... | ... | ... | ... |
49512 | 0515135739 | Eleventh Hour: An FBI Thriller (FBI Thriller (... | Catherine Coulter | 236283 | 0.0 |
49513 | 0515135739 | Eleventh Hour: An FBI Thriller (FBI Thriller (... | Catherine Coulter | 251613 | 0.0 |
49514 | 0515135739 | Eleventh Hour: An FBI Thriller (FBI Thriller (... | Catherine Coulter | 252071 | 0.0 |
49515 | 0515135739 | Eleventh Hour: An FBI Thriller (FBI Thriller (... | Catherine Coulter | 256407 | 0.0 |
49516 | 0515135739 | Eleventh Hour: An FBI Thriller (FBI Thriller (... | Catherine Coulter | 262399 | 0.0 |
49517 rows × 5 columns
# Dropping duplicates and making a pivot table for the KNN algorithm
# DataFrame.pivot() needs each (title, user) pair to be unique, so only the
# first rating of every pair is kept.
df_books_ratings.drop_duplicates(["title", "user"], keep="first", inplace=True)
# One row per title, one column per user; books a user has not rated become 0.
pivot = df_books_ratings.pivot(index="title", columns="user", values="rating").fillna(0)
pivot.shape
(673, 888)
pivot
user | 254 | 2276 | 2766 | 2977 | 3363 | 4017 | 4385 | 6242 | 6251 | 6323 | ... | 274004 | 274061 | 274301 | 274308 | 274808 | 275970 | 277427 | 277478 | 277639 | 278418 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
title | |||||||||||||||||||||
1984 | 9.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1st to Die: A Novel | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2nd Chance | 0.0 | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 Blondes | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
Without Remorse | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Year of Wonders | 0.0 | 0.0 | 0.0 | 7.0 | 0.0 | 0.0 | 0.0 | 7.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
You Belong To Me | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
\O\" Is for Outlaw" | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 8.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
673 rows × 888 columns
Convert the pivot table above to a compressed sparse row (CSR) matrix. This format is efficient for arithmetic operations, row slicing, and matrix-vector products. It is particularly useful for large matrices where most elements are zero, because it saves memory by storing only the non-zero elements.
# Sparse (CSR) copy of the title-by-user rating table.
matrix = csr_matrix(pivot.values)
matrix
<Compressed Sparse Row sparse matrix of dtype 'float32'
with 12423 stored elements and shape (673, 888)>
# Brute-force nearest-neighbour search using cosine distance between the
# rows (books) of the rating matrix.
model = NearestNeighbors(algorithm="brute", metric="cosine")
model.fit(matrix)
NearestNeighbors(algorithm='brute', metric='cosine')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
NearestNeighbors(algorithm='brute', metric='cosine')
Getting the results
# Function to return recommended books
# adapted from
# https://datascienceplus.com/building-a-book-recommender-system-the-basics-knn-and-matrix-factorization/
def get_recommends(book=""):
    """Return the books closest to ``book`` according to the fitted KNN model.

    Reads the module-level ``model`` (fitted nearest-neighbours estimator) and
    ``pivot`` (title-by-user rating table).

    Args:
        book: Title of the query book; it must be a row label of ``pivot``.

    Returns:
        A two-element list ``[book, neighbours]`` where ``neighbours`` is a
        list of ``[title, distance]`` pairs ordered from the largest distance
        to the smallest.

    Raises:
        KeyError: Raised by the ``pivot.loc`` lookup when ``book`` is not a
            row label of ``pivot``.
    """
    # kneighbors() expects a 2-D array, so the book's rating row is reshaped
    # into a single-row matrix.
    distances, indices = model.kneighbors(
        pivot.loc[book, :].values.reshape(1, -1), n_neighbors=5
    )
    # Flatten once instead of on every loop iteration.
    distances = distances.flatten()
    indices = indices.flatten()

    # The first neighbour is skipped (presumably the query book itself, at
    # distance 0). The remaining ones are emitted in reverse of the order
    # kneighbors() returned them.
    neighbours = [
        [pivot.index[idx], dist]
        for idx, dist in zip(indices[1:], distances[1:])
    ]
    neighbours.reverse()
    return [book, neighbours]
import pprint
# Module-level result; test_book_recommendation() reads this variable.
recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
pprint.pprint(recommends)
["Where the Heart Is (Oprah's Book Club (Paperback))",
[['The Weight of Water', np.float32(0.77085835)],
['The Surgeon', np.float32(0.7699411)],
['I Know This Much Is True', np.float32(0.7677075)],
['The Lovely Bones: A Novel', np.float32(0.7234864)]]]
Testing the results
def test_book_recommendation():
    """Check the module-level ``recommends`` result and print the verdict.

    The check passes when the first element of ``recommends`` is the queried
    title and, for each of the first two recommendations, the title is one of
    the expected titles and its distance is within 0.05 of the expected
    distance at the same position. Nothing is returned; a pass/fail message is
    printed.
    """
    test_pass = True
    if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
        test_pass = False
    recommended_books = [
        "I'll Be Seeing You",
        "The Weight of Water",
        "The Surgeon",
        "I Know This Much Is True",
    ]
    recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
    # Only the first two recommendations are inspected.
    for i in range(2):
        if recommends[1][i][0] not in recommended_books:
            test_pass = False
        if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
            test_pass = False
    if test_pass:
        print("You passed the challenge! 🎉🎉🎉🎉🎉")
    else:
        print("You haven't passed yet. Keep trying!")
test_book_recommendation()
You passed the challenge! 🎉🎉🎉🎉🎉