Book Recommendation Model

Published

January 1, 2024

import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import requests

Downloading the dataset

url = "https://cdn.freecodecamp.org/project-data/books/book-crossings.zip"
query_parameters = {"downloadformat": "zip"}

response = requests.get(url, params=query_parameters)

with open("book-crossings.zip", mode="wb") as file:
    file.write(response.content)
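
The write above assumes the request succeeded. A small check using requests' built-in raise_for_status (an optional addition, not in the original notebook) would surface HTTP errors before an empty or partial zip file is written:

response = requests.get(url, params=query_parameters)
response.raise_for_status()  # raises requests.exceptions.HTTPError on 4xx/5xx responses

with open("book-crossings.zip", mode="wb") as file:
    file.write(response.content)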
import zipfile

with zipfile.ZipFile("book-crossings.zip", mode="r") as archive:
    # extractall already unpacks every member, so no per-file loop is needed
    archive.extractall("book-crossings/")
books_filename = "book-crossings/BX-Books.csv"
ratings_filename = "book-crossings/BX-Book-Ratings.csv"
df_books = pd.read_csv(
    books_filename,
    encoding="ISO-8859-1",
    sep=";",
    header=0,
    names=["isbn", "title", "author"],
    usecols=["isbn", "title", "author"],
    dtype={"isbn": "str", "title": "str", "author": "str"},
)

df_ratings = pd.read_csv(
    ratings_filename,
    encoding="ISO-8859-1",
    sep=";",
    header=0,
    names=["user", "isbn", "rating"],
    usecols=["user", "isbn", "rating"],
    dtype={"user": "int32", "isbn": "str", "rating": "float32"},
)
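
Both files use ISO-8859-1 encoding and semicolons as separators, and the header row is replaced with shorter column names. A quick shape check (a sketch, not part of the original notebook) confirms both tables loaded fully:

print(df_books.shape, df_ratings.shape)  # expected (271379, 3) and (1149780, 3), matching the previews below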

Exploring the dataset

df_books
isbn title author
0 0195153448 Classical Mythology Mark P. O. Morford
1 0002005018 Clara Callan Richard Bruce Wright
2 0060973129 Decision in Normandy Carlo D'Este
3 0374157065 Flu: The Story of the Great Influenza Pandemic... Gina Bari Kolata
4 0393045218 The Mummies of Urumchi E. J. W. Barber
... ... ... ...
271374 0440400988 There's a Bat in Bunk Five Paula Danziger
271375 0525447644 From One to One Hundred Teri Sloat
271376 006008667X Lily Dale : The True Story of the Town that Ta... Christine Wicker
271377 0192126040 Republic (World's Classics) Plato
271378 0767409752 A Guided Tour of Rene Descartes' Meditations o... Christopher Biffle

271379 rows × 3 columns

df_ratings
user isbn rating
0 276725 034545104X 0.0
1 276726 0155061224 5.0
2 276727 0446520802 0.0
3 276729 052165615X 3.0
4 276729 0521795028 6.0
... ... ... ...
1149775 276704 1563526298 9.0
1149776 276706 0679447156 0.0
1149777 276709 0515107662 10.0
1149778 276721 0590442449 10.0
1149779 276723 05162443314 8.0

1149780 rows × 3 columns

# Remove users with fewer than 200 ratings and books with fewer than 100 ratings from the dataset

# Function adapted from
# https://datascienceplus.com/building-a-book-recommender-system-the-basics-knn-and-matrix-factorization/
counts_user = df_ratings["user"].value_counts()
counts_isbn = df_ratings["isbn"].value_counts()
counts_user = counts_user[counts_user >= 200].index
counts_isbn = counts_isbn[counts_isbn >= 100].index
df_ratings_filtered = df_ratings.loc[
    (df_ratings["user"].isin(counts_user.values))
    & (df_ratings["isbn"].isin(counts_isbn.values))
]

df_ratings_filtered.shape
(49781, 3)
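
Note that both thresholds are computed on the full ratings table before the intersection is taken, so a kept user can end up with fewer than 200 remaining rows once unpopular books are removed (and vice versa). A quick look at the filtered counts (a sketch, not part of the original notebook) makes this visible:

# Minimum number of ratings per user and per book remaining after the intersection
print(df_ratings_filtered["user"].value_counts().min())
print(df_ratings_filtered["isbn"].value_counts().min())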

Joining the two dataframes

df_books_ratings = pd.merge(df_books, df_ratings_filtered, on="isbn")

df_books_ratings["author"] = df_books_ratings["author"].str.title()

df_books_ratings
isbn title author user rating
0 0440234743 The Testament John Grisham 277478 0.0
1 0440234743 The Testament John Grisham 2977 0.0
2 0440234743 The Testament John Grisham 3363 0.0
3 0440234743 The Testament John Grisham 7346 9.0
4 0440234743 The Testament John Grisham 9856 0.0
... ... ... ... ... ...
49512 0515135739 Eleventh Hour: An FBI Thriller (FBI Thriller (... Catherine Coulter 236283 0.0
49513 0515135739 Eleventh Hour: An FBI Thriller (FBI Thriller (... Catherine Coulter 251613 0.0
49514 0515135739 Eleventh Hour: An FBI Thriller (FBI Thriller (... Catherine Coulter 252071 0.0
49515 0515135739 Eleventh Hour: An FBI Thriller (FBI Thriller (... Catherine Coulter 256407 0.0
49516 0515135739 Eleventh Hour: An FBI Thriller (FBI Thriller (... Catherine Coulter 262399 0.0

49517 rows × 5 columns
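
pd.merge defaults to an inner join on the isbn key, so filtered ratings whose ISBN has no entry in df_books are dropped here, which is why the row count falls slightly from 49781 to 49517. A quick check (a sketch) of how many ratings lack a matching book:

# Ratings in the filtered set whose ISBN does not appear in the books table
missing_isbn = ~df_ratings_filtered["isbn"].isin(df_books["isbn"])
print(missing_isbn.sum())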

Dropping duplicates and making a pivot table for the KNN algorithm

DataFrame.pivot requires each (title, user) pair to be unique; duplicates occur here, typically because different ISBN editions of the same book share a title, so duplicate pairs are dropped first, keeping the first rating.

df_books_ratings.drop_duplicates(["title", "user"], keep="first", inplace=True)

pivot = df_books_ratings.pivot(index="title", columns="user", values="rating").fillna(0)

pivot.shape
(673, 888)
pivot
user 254 2276 2766 2977 3363 4017 4385 6242 6251 6323 ... 274004 274061 274301 274308 274808 275970 277427 277478 277639 278418
title
1984 9.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1st to Die: A Novel 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2nd Chance 0.0 10.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 Blondes 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Without Remorse 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Year of Wonders 0.0 0.0 0.0 7.0 0.0 0.0 0.0 7.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
You Belong To Me 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
\O\" Is for Outlaw" 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 8.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

673 rows × 888 columns
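
Each row of the pivot is one book's rating vector across the 888 retained users, with 0.0 standing in for "not rated", so the matrix is overwhelmingly zeros. The fill rate can be checked directly (a sketch, not part of the original notebook):

non_zero = (pivot.values != 0).sum()
print(non_zero / pivot.size)  # fraction of (book, user) cells that actually hold a rating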

The pivot table above is converted to a compressed sparse row (CSR) matrix, a format that is efficient for arithmetic operations, row slicing, and matrix-vector products. It is particularly useful for large matrices where most elements are zero, since it saves memory by storing only the non-zero elements.

matrix = csr_matrix(pivot.values)

matrix
<Compressed Sparse Row sparse matrix of dtype 'float32'
    with 12423 stored elements and shape (673, 888)>
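
To make the memory savings concrete, the dense pivot array can be compared with the three CSR buffers (data, indices, indptr); this is just an illustrative sketch:

dense_bytes = pivot.values.nbytes
sparse_bytes = matrix.data.nbytes + matrix.indices.nbytes + matrix.indptr.nbytes
print(dense_bytes, sparse_bytes)  # the CSR matrix stores only the ~12,000 non-zero ratings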
model = NearestNeighbors(algorithm="brute", metric="cosine")

model.fit(matrix)
NearestNeighbors(algorithm='brute', metric='cosine')
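
With algorithm="brute" and metric="cosine", fitting essentially just stores the matrix; at query time the model computes the cosine distance 1 - (a·b)/(|a||b|) between the query row and every other book row and returns the smallest ones. A tiny hand computation on two made-up rating vectors (hypothetical values, not rows from the dataset):

import numpy as np

a = np.array([9.0, 0.0, 7.0])   # hypothetical rating vector for book A
b = np.array([8.0, 0.0, 10.0])  # hypothetical rating vector for book B
cosine_distance = 1 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
print(cosine_distance)  # close to 0: the two books were rated similarly by the same users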

Getting the results

# Function to return recommended books
# adapted from
# https://datascienceplus.com/building-a-book-recommender-system-the-basics-knn-and-matrix-factorization/
def get_recommends(book=""):
    # Query the 5 nearest neighbors of the book's rating vector;
    # the closest neighbor is the queried book itself (distance 0)
    distances, indices = model.kneighbors(
        pivot.loc[book, :].values.reshape(1, -1), n_neighbors=5
    )
    recommended_books = [book, []]
    # Skip index 0 (the book itself) and insert at the front, so the
    # recommendations come out ordered from farthest to closest
    for i in range(1, len(distances.flatten())):
        recommended_books[1].insert(
            0, [pivot.index[indices.flatten()[i]], distances.flatten()[i]]
        )

    return recommended_books
import pprint

recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")

pprint.pprint(recommends)
["Where the Heart Is (Oprah's Book Club (Paperback))",
 [['The Weight of Water', np.float32(0.77085835)],
  ['The Surgeon', np.float32(0.7699411)],
  ['I Know This Much Is True', np.float32(0.7677075)],
  ['The Lovely Bones: A Novel', np.float32(0.7234864)]]]
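
get_recommends assumes the requested title survived the filtering step and is present in the pivot index; otherwise pivot.loc raises a KeyError. A small wrapper like the following (a hypothetical addition, not part of the original notebook) would make that failure mode explicit:

def get_recommends_checked(book=""):
    # Hypothetical guard: fail with a clear message when the title was filtered out
    if book not in pivot.index:
        raise ValueError(f"'{book}' is not in the filtered dataset")
    return get_recommends(book)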

Testing the results

def test_book_recommendation():
    test_pass = True

    if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
        test_pass = False

    recommended_books = [
        "I'll Be Seeing You",
        "The Weight of Water",
        "The Surgeon",
        "I Know This Much Is True",
    ]
    recommended_books_dist = [0.8, 0.77, 0.77, 0.77]

    for i in range(2):
        if recommends[1][i][0] not in recommended_books:
            test_pass = False
        if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
            test_pass = False
    if test_pass:
        print("You passed the challenge! 🎉🎉🎉🎉🎉")
    else:
        print("You haven't passed yet. Keep trying!")


test_book_recommendation()
You passed the challenge! 🎉🎉🎉🎉🎉