Book Recommendation Model

python
Published

January 1, 2024

import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import requests

# Download the Book-Crossings archive and save it next to the notebook.
url = "https://cdn.freecodecamp.org/project-data/books/book-crossings.zip"
query_parameters = {"downloadformat": "zip"}

# requests has no default timeout; without one a stalled connection would
# block this cell indefinitely.
response = requests.get(url, params=query_parameters, timeout=60)
# Stop on 4xx/5xx responses instead of saving an error body as the "zip".
response.raise_for_status()

with open("book-crossings.zip", mode="wb") as file:
    file.write(response.content)
import zipfile

# Unpack the downloaded archive into book-crossings/.
# extractall() already writes every member, so it is called exactly once
# rather than once per entry of namelist().
with zipfile.ZipFile("book-crossings.zip", mode="r") as archive:
    archive.extractall("book-crossings/")
# Paths of the two extracted CSV files used below.
ratings_filename = "book-crossings/BX-Book-Ratings.csv"
books_filename = "book-crossings/BX-Books.csv"

# Column names applied to each file. header=0 combined with names= makes
# pandas discard the file's own header row and use these names instead.
book_columns = ["isbn", "title", "author"]
rating_columns = ["user", "isbn", "rating"]

# Book catalogue: every kept column is read as text.
df_books = pd.read_csv(
    books_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=book_columns,
    usecols=book_columns,
    dtype=dict.fromkeys(book_columns, "str"),
)

# Ratings: one row per (user, isbn) pair with a numeric rating.
df_ratings = pd.read_csv(
    ratings_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=rating_columns,
    usecols=rating_columns,
    dtype={"rating": "float32", "isbn": "str", "user": "int32"},
)
df_books
isbn title author
0 0195153448 Classical Mythology Mark P. O. Morford
1 0002005018 Clara Callan Richard Bruce Wright
2 0060973129 Decision in Normandy Carlo D'Este
3 0374157065 Flu: The Story of the Great Influenza Pandemic... Gina Bari Kolata
4 0393045218 The Mummies of Urumchi E. J. W. Barber
... ... ... ...
271374 0440400988 There's a Bat in Bunk Five Paula Danziger
271375 0525447644 From One to One Hundred Teri Sloat
271376 006008667X Lily Dale : The True Story of the Town that Ta... Christine Wicker
271377 0192126040 Republic (World's Classics) Plato
271378 0767409752 A Guided Tour of Rene Descartes' Meditations o... Christopher Biffle

271379 rows × 3 columns

# Drop users with fewer than 200 ratings and books with fewer than 100 ratings.
# Both counts are taken on the unfiltered ratings table.

# Approach adapted from
# https://datascienceplus.com/building-a-book-recommender-system-the-basics-knn-and-matrix-factorization/
counts_isbn = df_ratings["isbn"].value_counts()
counts_user = df_ratings["user"].value_counts()
# Keep only the labels (ISBNs / user ids) that reach the threshold.
counts_isbn = counts_isbn[counts_isbn >= 100].index
counts_user = counts_user[counts_user >= 200].index

# A rating survives only if both its user and its book passed the cut.
keep_user = df_ratings["user"].isin(counts_user.values)
keep_isbn = df_ratings["isbn"].isin(counts_isbn.values)
df_ratings_filtered = df_ratings.loc[keep_user & keep_isbn]

df_ratings_filtered.shape
(49781, 3)
# Attach title and author to every remaining rating. This is an inner join
# (pd.merge's default), so ratings whose ISBN is missing from the catalogue
# are dropped.
df_books_ratings = pd.merge(
    left=df_books,
    right=df_ratings_filtered,
    how="inner",
    on="isbn",
)

# Normalise author capitalisation to Title Case.
title_cased_authors = df_books_ratings["author"].str.title()
df_books_ratings["author"] = title_cased_authors

df_books_ratings
isbn title author user rating
0 0440234743 The Testament John Grisham 277478 0.0
1 0440234743 The Testament John Grisham 2977 0.0
2 0440234743 The Testament John Grisham 3363 0.0
3 0440234743 The Testament John Grisham 7346 9.0
4 0440234743 The Testament John Grisham 9856 0.0
... ... ... ... ... ...
49512 0515135739 Eleventh Hour: An FBI Thriller (FBI Thriller (... Catherine Coulter 236283 0.0
49513 0515135739 Eleventh Hour: An FBI Thriller (FBI Thriller (... Catherine Coulter 251613 0.0
49514 0515135739 Eleventh Hour: An FBI Thriller (FBI Thriller (... Catherine Coulter 252071 0.0
49515 0515135739 Eleventh Hour: An FBI Thriller (FBI Thriller (... Catherine Coulter 256407 0.0
49516 0515135739 Eleventh Hour: An FBI Thriller (FBI Thriller (... Catherine Coulter 262399 0.0

49517 rows × 5 columns

# pivot() raises on repeated (index, column) pairs, so keep only the first
# rating for each (title, user) combination.
# NOTE(review): repeats presumably come from one title being listed under
# several ISBNs - confirm against the data.
df_books_ratings = df_books_ratings.drop_duplicates(
    subset=["title", "user"], keep="first"
)

# One row per title, one column per user; a missing rating becomes 0.
pivot = df_books_ratings.pivot(columns="user", index="title", values="rating")
pivot = pivot.fillna(0)

pivot.shape
(673, 888)
pivot
user 254 2276 2766 2977 3363 4017 4385 6242 6251 6323 ... 274004 274061 274301 274308 274808 275970 277427 277478 277639 278418
title
1984 9.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1st to Die: A Novel 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2nd Chance 0.0 10.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 Blondes 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Without Remorse 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Year of Wonders 0.0 0.0 0.0 7.0 0.0 0.0 0.0 7.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
You Belong To Me 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
\O\" Is for Outlaw" 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 8.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

673 rows × 888 columns

# Hold the title-by-user rating table in compressed sparse row form.
ratings_array = pivot.to_numpy()
matrix = csr_matrix(ratings_array)

matrix
<Compressed Sparse Row sparse matrix of dtype 'float32'
    with 12423 stored elements and shape (673, 888)>
# Exhaustive (brute-force) neighbour search using cosine distance between
# the per-title rating rows of `matrix`.
model = NearestNeighbors(metric="cosine", algorithm="brute")
model.fit(matrix)
NearestNeighbors(algorithm='brute', metric='cosine')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Function to return recommended books - this will be tested.

# Adapted from
# https://datascienceplus.com/building-a-book-recommender-system-the-basics-knn-and-matrix-factorization/
def get_recommends(book="", n_neighbors=5):
    """Return the books whose rating rows are closest to ``book``.

    Args:
        book: Title to look up; must be a row label of ``pivot``.
        n_neighbors: Number of neighbours requested from ``model``. The
            nearest hit is discarded (see below), so ``n_neighbors - 1``
            recommendations are returned. Defaults to 5.

    Returns:
        A two-element list ``[book, [[title, distance], ...]]`` where the
        inner list is ordered from the largest distance to the smallest.

    Raises:
        KeyError: If ``book`` is not a title in ``pivot``.
    """
    if book not in pivot.index:
        raise KeyError(f"{book!r} is not a title in the ratings table")

    distances, indices = model.kneighbors(
        pivot.loc[book, :].values.reshape(1, -1), n_neighbors=n_neighbors
    )
    # A single query row yields (1, k) arrays; flatten each once up front.
    distances = distances.flatten()
    indices = indices.flatten()

    # Position 0 is skipped: the query row is part of the fitted data, so the
    # closest hit is expected to be the book itself.
    neighbours = [
        [pivot.index[idx], dist]
        for idx, dist in zip(indices[1:], distances[1:])
    ]
    # kneighbors reports nearest first; callers expect farthest first.
    neighbours.reverse()

    return [book, neighbours]
import pprint

# Try the recommender on one title and pretty-print the nested result.
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
pprint.pprint(books)


def test_book_recommendation():
    """Check get_recommends() against the challenge's reference answer.

    Looks up one fixed title, then verifies that the echoed title matches and
    that each of the first two recommendations is one of the expected titles
    with a distance within 0.05 of the reference value at the same position.
    Prints a pass or fail message; returns nothing.
    """
    query_title = "Where the Heart Is (Oprah's Book Club (Paperback))"
    expected_titles = [
        "I'll Be Seeing You",
        "The Weight of Water",
        "The Surgeon",
        "I Know This Much Is True",
    ]
    expected_distances = [0.8, 0.77, 0.77, 0.77]

    recommends = get_recommends(query_title)

    test_pass = True
    if recommends[0] != query_title:
        test_pass = False

    # Only the first two recommendations are inspected.
    for position in range(2):
        entry = recommends[1][position]
        if entry[0] not in expected_titles:
            test_pass = False
        if abs(entry[1] - expected_distances[position]) >= 0.05:
            test_pass = False

    print(
        "You passed the challenge! 🎉🎉🎉🎉🎉"
        if test_pass
        else "You haven't passed yet. Keep trying!"
    )


# Run the challenge check; it prints a pass or fail message.
test_book_recommendation()
["Where the Heart Is (Oprah's Book Club (Paperback))",
 [['The Weight of Water', np.float32(0.77085835)],
  ['The Surgeon', np.float32(0.7699411)],
  ['I Know This Much Is True', np.float32(0.7677075)],
  ['The Lovely Bones: A Novel', np.float32(0.7234864)]]]
You passed the challenge! 🎉🎉🎉🎉🎉