Book Recommendation Model

Published

January 1, 2024

import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import requests

Downloading the dataset

url = "https://cdn.freecodecamp.org/project-data/books/book-crossings.zip"
query_parameters = {"downloadformat": "zip"}

response = requests.get(url, params=query_parameters)

with open("book-crossings.zip", mode="wb") as file:
    file.write(response.content)
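
The write above assumes the request succeeded. A small check using requests' built-in raise_for_status (an optional addition, not in the original notebook) would surface HTTP errors before an empty or partial zip file is written:

response = requests.get(url, params=query_parameters)
response.raise_for_status()  # raises requests.exceptions.HTTPError on 4xx/5xx responses

with open("book-crossings.zip", mode="wb") as file:
    file.write(response.content)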
import zipfile

with zipfile.ZipFile("book-crossings.zip", mode="r") as archive:
    # extractall already unpacks every member, so no per-file loop is needed
    archive.extractall("book-crossings/")
books_filename = "book-crossings/BX-Books.csv"
ratings_filename = "book-crossings/BX-Book-Ratings.csv"
df_books = pd.read_csv(
    books_filename,
    encoding="ISO-8859-1",
    sep=";",
    header=0,
    names=["isbn", "title", "author"],
    usecols=["isbn", "title", "author"],
    dtype={"isbn": "str", "title": "str", "author": "str"},
)

df_ratings = pd.read_csv(
    ratings_filename,
    encoding="ISO-8859-1",
    sep=";",
    header=0,
    names=["user", "isbn", "rating"],
    usecols=["user", "isbn", "rating"],
    dtype={"user": "int32", "isbn": "str", "rating": "float32"},
)
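
Both files use ISO-8859-1 encoding and semicolons as separators, and the header row is replaced with shorter column names. A quick shape check (a sketch, not part of the original notebook) confirms both tables loaded fully:

print(df_books.shape, df_ratings.shape)  # expected (271379, 3) and (1149780, 3), matching the previews below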

Exploring the dataset

df_books
isbn title author
0 0195153448 Classical Mythology Mark P. O. Morford
1 0002005018 Clara Callan Richard Bruce Wright
2 0060973129 Decision in Normandy Carlo D'Este
3 0374157065 Flu: The Story of the Great Influenza Pandemic... Gina Bari Kolata
4 0393045218 The Mummies of Urumchi E. J. W. Barber
... ... ... ...
271374 0440400988 There's a Bat in Bunk Five Paula Danziger
271375 0525447644 From One to One Hundred Teri Sloat
271376 006008667X Lily Dale : The True Story of the Town that Ta... Christine Wicker
271377 0192126040 Republic (World's Classics) Plato
271378 0767409752 A Guided Tour of Rene Descartes' Meditations o... Christopher Biffle

271379 rows × 3 columns

df_ratings
user isbn rating
0 276725 034545104X 0.0
1 276726 0155061224 5.0
2 276727 0446520802 0.0
3 276729 052165615X 3.0
4 276729 0521795028 6.0
... ... ... ...
1149775 276704 1563526298 9.0
1149776 276706 0679447156 0.0
1149777 276709 0515107662 10.0
1149778 276721 0590442449 10.0
1149779 276723 05162443314 8.0

1149780 rows × 3 columns

# Remove users with fewer than 200 ratings and books with fewer than 100 ratings from the dataset

# Function adapted from
# https://datascienceplus.com/building-a-book-recommender-system-the-basics-knn-and-matrix-factorization/
counts_user = df_ratings["user"].value_counts()
counts_isbn = df_ratings["isbn"].value_counts()
counts_user = counts_user[counts_user >= 200].index
counts_isbn = counts_isbn[counts_isbn >= 100].index
df_ratings_filtered = df_ratings.loc[
    (df_ratings["user"].isin(counts_user.values))
    & (df_ratings["isbn"].isin(counts_isbn.values))
]

df_ratings_filtered.shape
(49781, 3)
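
Note that both thresholds are computed on the full ratings table before the intersection is taken, so a kept user can end up with fewer than 200 remaining rows once unpopular books are removed (and vice versa). A quick look at the filtered counts (a sketch, not part of the original notebook) makes this visible:

# Minimum number of ratings per user and per book remaining after the intersection
print(df_ratings_filtered["user"].value_counts().min())
print(df_ratings_filtered["isbn"].value_counts().min())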

Joining the two dataframes

df_books_ratings = pd.merge(df_books, df_ratings_filtered, on="isbn")

df_books_ratings["author"] = df_books_ratings["author"].str.title()

df_books_ratings
isbn title author user rating
0 0440234743 The Testament John Grisham 277478 0.0
1 0440234743 The Testament John Grisham 2977 0.0
2 0440234743 The Testament John Grisham 3363 0.0
3 0440234743 The Testament John Grisham 7346 9.0
4 0440234743 The Testament John Grisham 9856 0.0
... ... ... ... ... ...
49512 0515135739 Eleventh Hour: An FBI Thriller (FBI Thriller (... Catherine Coulter 236283 0.0
49513 0515135739 Eleventh Hour: An FBI Thriller (FBI Thriller (... Catherine Coulter 251613 0.0
49514 0515135739 Eleventh Hour: An FBI Thriller (FBI Thriller (... Catherine Coulter 252071 0.0
49515 0515135739 Eleventh Hour: An FBI Thriller (FBI Thriller (... Catherine Coulter 256407 0.0
49516 0515135739 Eleventh Hour: An FBI Thriller (FBI Thriller (... Catherine Coulter 262399 0.0

49517 rows × 5 columns
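
pd.merge defaults to an inner join on the isbn key, so filtered ratings whose ISBN has no entry in df_books are dropped here, which is why the row count falls slightly from 49781 to 49517. A quick check (a sketch) of how many ratings lack a matching book:

# Ratings in the filtered set whose ISBN does not appear in the books table
missing_isbn = ~df_ratings_filtered["isbn"].isin(df_books["isbn"])
print(missing_isbn.sum())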

Dropping duplicates and making a pivot table for the KNN algorithm

DataFrame.pivot requires each (title, user) pair to be unique; duplicates occur here, typically because different ISBN editions of the same book share a title, so duplicate pairs are dropped first, keeping the first rating.

df_books_ratings.drop_duplicates(["title", "user"], keep="first", inplace=True)

pivot = df_books_ratings.pivot(index="title", columns="user", values="rating").fillna(0)

pivot.shape
(673, 888)
pivot
user 254 2276 2766 2977 3363 4017 4385 6242 6251 6323 ... 274004 274061 274301 274308 274808 275970 277427 277478 277639 278418
title
1984 9.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1st to Die: A Novel 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2nd Chance 0.0 10.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 Blondes 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Without Remorse 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Year of Wonders 0.0 0.0 0.0 7.0 0.0 0.0 0.0 7.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
You Belong To Me 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
\O\" Is for Outlaw" 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 8.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

673 rows × 888 columns
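
Each row of the pivot is one book's rating vector across the 888 retained users, with 0.0 standing in for "not rated", so the matrix is overwhelmingly zeros. The fill rate can be checked directly (a sketch, not part of the original notebook):

non_zero = (pivot.values != 0).sum()
print(non_zero / pivot.size)  # fraction of (book, user) cells that actually hold a rating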

The pivot table above is converted to a compressed sparse row (CSR) matrix, a format that is efficient for arithmetic operations, row slicing, and matrix-vector products. It is particularly useful for large matrices where most elements are zero, since it saves memory by storing only the non-zero elements.

matrix = csr_matrix(pivot.values)

matrix
<Compressed Sparse Row sparse matrix of dtype 'float32'
    with 12423 stored elements and shape (673, 888)>
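
To make the memory savings concrete, the dense pivot array can be compared with the three CSR buffers (data, indices, indptr); this is just an illustrative sketch:

dense_bytes = pivot.values.nbytes
sparse_bytes = matrix.data.nbytes + matrix.indices.nbytes + matrix.indptr.nbytes
print(dense_bytes, sparse_bytes)  # the CSR matrix stores only the ~12,000 non-zero ratings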
model = NearestNeighbors(algorithm="brute", metric="cosine")

model.fit(matrix)
NearestNeighbors(algorithm='brute', metric='cosine')
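
With algorithm="brute" and metric="cosine", fitting essentially just stores the matrix; at query time the model computes the cosine distance 1 - (a·b)/(|a||b|) between the query row and every other book row and returns the smallest ones. A tiny hand computation on two made-up rating vectors (hypothetical values, not rows from the dataset):

import numpy as np

a = np.array([9.0, 0.0, 7.0])   # hypothetical rating vector for book A
b = np.array([8.0, 0.0, 10.0])  # hypothetical rating vector for book B
cosine_distance = 1 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
print(cosine_distance)  # close to 0: the two books were rated similarly by the same users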

Getting the results

# Function to return recommended books
# adapted from
# https://datascienceplus.com/building-a-book-recommender-system-the-basics-knn-and-matrix-factorization/
def get_recommends(book=""):
    # Query the 5 nearest neighbors of the book's rating vector;
    # the closest neighbor is the queried book itself (distance 0)
    distances, indices = model.kneighbors(
        pivot.loc[book, :].values.reshape(1, -1), n_neighbors=5
    )
    recommended_books = [book, []]
    # Skip index 0 (the book itself) and insert at the front, so the
    # recommendations come out ordered from farthest to closest
    for i in range(1, len(distances.flatten())):
        recommended_books[1].insert(
            0, [pivot.index[indices.flatten()[i]], distances.flatten()[i]]
        )

    return recommended_books
import pprint

recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")

pprint.pprint(recommends)
["Where the Heart Is (Oprah's Book Club (Paperback))",
 [['The Weight of Water', np.float32(0.77085835)],
  ['The Surgeon', np.float32(0.7699411)],
  ['I Know This Much Is True', np.float32(0.7677075)],
  ['The Lovely Bones: A Novel', np.float32(0.7234864)]]]
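
get_recommends assumes the requested title survived the filtering step and is present in the pivot index; otherwise pivot.loc raises a KeyError. A small wrapper like the following (a hypothetical addition, not part of the original notebook) would make that failure mode explicit:

def get_recommends_checked(book=""):
    # Hypothetical guard: fail with a clear message when the title was filtered out
    if book not in pivot.index:
        raise ValueError(f"'{book}' is not in the filtered dataset")
    return get_recommends(book)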

Testing the results

def test_book_recommendation():
    test_pass = True

    if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
        test_pass = False

    recommended_books = [
        "I'll Be Seeing You",
        "The Weight of Water",
        "The Surgeon",
        "I Know This Much Is True",
    ]
    recommended_books_dist = [0.8, 0.77, 0.77, 0.77]

    for i in range(2):
        if recommends[1][i][0] not in recommended_books:
            test_pass = False
        if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
            test_pass = False
    if test_pass:
        print("You passed the challenge! 🎉🎉🎉🎉🎉")
    else:
        print("You haven't passed yet. Keep trying!")


test_book_recommendation()
You passed the challenge! 🎉🎉🎉🎉🎉