import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import requests
# Download the Book-Crossings archive and store it in the working directory.
url = "https://cdn.freecodecamp.org/project-data/books/book-crossings.zip"
query_parameters = {"downloadformat": "zip"}
response = requests.get(url, params=query_parameters)
# Fail loudly on an HTTP error instead of saving an error page as a ".zip".
response.raise_for_status()
with open("book-crossings.zip", mode="wb") as file:
    file.write(response.content)
import zipfile

# Unpack the whole archive into book-crossings/. A single extractall() call
# already extracts every member, so no loop over namelist() is needed.
with zipfile.ZipFile("book-crossings.zip", mode="r") as archive:
    archive.extractall("book-crossings/")
# Paths of the two CSV files read below, relative to the working directory.
books_filename = "book-crossings/BX-Books.csv"
ratings_filename = "book-crossings/BX-Book-Ratings.csv"
# Load the book catalogue. The file is ";"-separated and ISO-8859-1 encoded;
# header=0 combined with `names` replaces the file's own header row with the
# column names given here.
df_books = pd.read_csv(
    books_filename,
    encoding="ISO-8859-1",
    sep=";",
    header=0,
    names=["isbn", "title", "author"],
    usecols=["isbn", "title", "author"],
    dtype={"isbn": "str", "title": "str", "author": "str"},
)
# Load the ratings. The file is ";"-separated and ISO-8859-1 encoded;
# header=0 combined with `names` replaces the file's own header row with the
# column names given here.
df_ratings = pd.read_csv(
    ratings_filename,
    encoding="ISO-8859-1",
    sep=";",
    header=0,
    names=["user", "isbn", "rating"],
    usecols=["user", "isbn", "rating"],
    dtype={"user": "int32", "isbn": "str", "rating": "float32"},
)
df_books
isbn | title | author | |
---|---|---|---|
0 | 0195153448 | Classical Mythology | Mark P. O. Morford |
1 | 0002005018 | Clara Callan | Richard Bruce Wright |
2 | 0060973129 | Decision in Normandy | Carlo D'Este |
3 | 0374157065 | Flu: The Story of the Great Influenza Pandemic... | Gina Bari Kolata |
4 | 0393045218 | The Mummies of Urumchi | E. J. W. Barber |
... | ... | ... | ... |
271374 | 0440400988 | There's a Bat in Bunk Five | Paula Danziger |
271375 | 0525447644 | From One to One Hundred | Teri Sloat |
271376 | 006008667X | Lily Dale : The True Story of the Town that Ta... | Christine Wicker |
271377 | 0192126040 | Republic (World's Classics) | Plato |
271378 | 0767409752 | A Guided Tour of Rene Descartes' Meditations o... | Christopher Biffle |
271379 rows × 3 columns
# Remove from the dataset users with fewer than 200 ratings and books with
# fewer than 100 ratings.
# Function adapted from
# https://datascienceplus.com/building-a-book-recommender-system-the-basics-knn-and-matrix-factorization/
counts_user = df_ratings["user"].value_counts()
counts_isbn = df_ratings["isbn"].value_counts()
# From here on the two names hold the *labels* (user ids / ISBNs) that meet
# the thresholds, not the counts themselves.
counts_user = counts_user[counts_user >= 200].index
counts_isbn = counts_isbn[counts_isbn >= 100].index
# Keep only ratings whose user AND book both passed the thresholds.
df_ratings_filtered = df_ratings.loc[
    (df_ratings["user"].isin(counts_user.values))
    & (df_ratings["isbn"].isin(counts_isbn.values))
]
df_ratings_filtered.shape
(49781, 3)
# Attach title/author to every retained rating by joining on ISBN.
df_books_ratings = pd.merge(df_books, df_ratings_filtered, on="isbn")
# Normalise the capitalisation of author names (e.g. "JOHN GRISHAM" -> "John Grisham").
df_books_ratings["author"] = df_books_ratings["author"].str.title()
df_books_ratings
isbn | title | author | user | rating | |
---|---|---|---|---|---|
0 | 0440234743 | The Testament | John Grisham | 277478 | 0.0 |
1 | 0440234743 | The Testament | John Grisham | 2977 | 0.0 |
2 | 0440234743 | The Testament | John Grisham | 3363 | 0.0 |
3 | 0440234743 | The Testament | John Grisham | 7346 | 9.0 |
4 | 0440234743 | The Testament | John Grisham | 9856 | 0.0 |
... | ... | ... | ... | ... | ... |
49512 | 0515135739 | Eleventh Hour: An FBI Thriller (FBI Thriller (... | Catherine Coulter | 236283 | 0.0 |
49513 | 0515135739 | Eleventh Hour: An FBI Thriller (FBI Thriller (... | Catherine Coulter | 251613 | 0.0 |
49514 | 0515135739 | Eleventh Hour: An FBI Thriller (FBI Thriller (... | Catherine Coulter | 252071 | 0.0 |
49515 | 0515135739 | Eleventh Hour: An FBI Thriller (FBI Thriller (... | Catherine Coulter | 256407 | 0.0 |
49516 | 0515135739 | Eleventh Hour: An FBI Thriller (FBI Thriller (... | Catherine Coulter | 262399 | 0.0 |
49517 rows × 5 columns
# DataFrame.pivot() raises on duplicate (index, column) pairs, so keep only the
# first rating for each (title, user) combination before reshaping.
df_books_ratings.drop_duplicates(["title", "user"], keep="first", inplace=True)
# One row per title, one column per user; (title, user) pairs with no rating become 0.
pivot = df_books_ratings.pivot(index="title", columns="user", values="rating").fillna(0)
pivot.shape
(673, 888)
pivot
user | 254 | 2276 | 2766 | 2977 | 3363 | 4017 | 4385 | 6242 | 6251 | 6323 | ... | 274004 | 274061 | 274301 | 274308 | 274808 | 275970 | 277427 | 277478 | 277639 | 278418 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
title | |||||||||||||||||||||
1984 | 9.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1st to Die: A Novel | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2nd Chance | 0.0 | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 Blondes | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
Without Remorse | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Year of Wonders | 0.0 | 0.0 | 0.0 | 7.0 | 0.0 | 0.0 | 0.0 | 7.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
You Belong To Me | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
\O\" Is for Outlaw" | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 8.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
673 rows × 888 columns
# Store the title-by-user rating table in compressed sparse row form.
matrix = csr_matrix(pivot.values)
matrix
<Compressed Sparse Row sparse matrix of dtype 'float32'
with 12423 stored elements and shape (673, 888)>
# Brute-force nearest-neighbour search using cosine distance between the
# rows of `matrix` (one row per book title).
model = NearestNeighbors(algorithm="brute", metric="cosine")
model.fit(matrix)
NearestNeighbors(algorithm='brute', metric='cosine')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
NearestNeighbors(algorithm='brute', metric='cosine')
# function to return recommended books - this will be tested
# Function adapted from
# https://datascienceplus.com/building-a-book-recommender-system-the-basics-knn-and-matrix-factorization/
def get_recommends(book=""):
    """Return the nearest neighbours of *book* according to the fitted model.

    Args:
        book: A title present in ``pivot.index``.

    Returns:
        A two-element list ``[book, neighbours]`` where ``neighbours`` is a
        list of ``[title, distance]`` pairs. The pairs appear in the reverse
        of the order returned by ``model.kneighbors``.

    Raises:
        KeyError: If *book* is not a row label of ``pivot``.
    """
    distances, indices = model.kneighbors(
        pivot.loc[book, :].values.reshape(1, -1), n_neighbors=5
    )
    # kneighbors returns 2-D arrays (one row per query); flatten them once.
    distances = distances.flatten()
    indices = indices.flatten()

    # Skip position 0 -- presumably the queried book itself (distance ~0);
    # this relies on the query row being part of the fitted data.
    neighbours = [
        [pivot.index[idx], dist]
        for idx, dist in zip(indices[1:], distances[1:])
    ]
    # Reverse the kneighbors order, matching the original insert-at-front loop.
    neighbours.reverse()
    return [book, neighbours]
# Example query: pretty-print the recommendations for one title.
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
import pprint

pprint.pprint(books)
def test_book_recommendation():
    """Check get_recommends() against the challenge's expected output.

    The check passes when the first element of the result is the queried
    title and, for the first two recommendations, the title is one of the
    expected titles and the distance is within 0.05 of the expected distance
    at the same position. The outcome is printed, not returned.
    """
    test_pass = True
    recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
    if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
        test_pass = False
    recommended_books = [
        "I'll Be Seeing You",
        "The Weight of Water",
        "The Surgeon",
        "I Know This Much Is True",
    ]
    recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
    # Only the first two recommendations are checked.
    for i in range(2):
        if recommends[1][i][0] not in recommended_books:
            test_pass = False
        if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
            test_pass = False
    if test_pass:
        print("You passed the challenge! 🎉🎉🎉🎉🎉")
    else:
        print("You haven't passed yet. Keep trying!")
test_book_recommendation()
["Where the Heart Is (Oprah's Book Club (Paperback))",
[['The Weight of Water', np.float32(0.77085835)],
['The Surgeon', np.float32(0.7699411)],
['I Know This Much Is True', np.float32(0.7677075)],
['The Lovely Bones: A Novel', np.float32(0.7234864)]]]
You passed the challenge! 🎉🎉🎉🎉🎉