import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import requests

Downloading the dataset
# Download the Book-Crossing archive and save it to the working directory.
url = "https://cdn.freecodecamp.org/project-data/books/book-crossings.zip"
query_parameters = {"downloadformat": "zip"}
# A timeout keeps the request from hanging forever on a stalled connection.
response = requests.get(url, params=query_parameters, timeout=60)
# Fail on an HTTP error status instead of saving an error page as the archive.
response.raise_for_status()
with open("book-crossings.zip", mode="wb") as file:
    file.write(response.content)

import zipfile
# Unpack the downloaded archive. A single extractall() call extracts every
# member, so there is no need to repeat it for each name in the archive.
with zipfile.ZipFile("book-crossings.zip", mode="r") as archive:
    archive.extractall("book-crossings/")

books_filename = "book-crossings/BX-Books.csv"
ratings_filename = "book-crossings/BX-Book-Ratings.csv"

# Both files are read as semicolon-separated, ISO-8859-1 encoded text.
# header=0 combined with names= replaces the header row found in each file
# with the short column names listed here.
df_books = pd.read_csv(
    books_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    usecols=["isbn", "title", "author"],
    names=["isbn", "title", "author"],
    dtype={"isbn": "str", "title": "str", "author": "str"},
)

df_ratings = pd.read_csv(
    ratings_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    usecols=["user", "isbn", "rating"],
    names=["user", "isbn", "rating"],
    dtype={"user": "int32", "isbn": "str", "rating": "float32"},
)

# Exploring the dataset
df_books

| | isbn | title | author |
|---|---|---|---|
| 0 | 0195153448 | Classical Mythology | Mark P. O. Morford |
| 1 | 0002005018 | Clara Callan | Richard Bruce Wright |
| 2 | 0060973129 | Decision in Normandy | Carlo D'Este |
| 3 | 0374157065 | Flu: The Story of the Great Influenza Pandemic... | Gina Bari Kolata |
| 4 | 0393045218 | The Mummies of Urumchi | E. J. W. Barber |
| ... | ... | ... | ... |
| 271374 | 0440400988 | There's a Bat in Bunk Five | Paula Danziger |
| 271375 | 0525447644 | From One to One Hundred | Teri Sloat |
| 271376 | 006008667X | Lily Dale : The True Story of the Town that Ta... | Christine Wicker |
| 271377 | 0192126040 | Republic (World's Classics) | Plato |
| 271378 | 0767409752 | A Guided Tour of Rene Descartes' Meditations o... | Christopher Biffle |
271379 rows × 3 columns
df_ratings

| | user | isbn | rating |
|---|---|---|---|
| 0 | 276725 | 034545104X | 0.0 |
| 1 | 276726 | 0155061224 | 5.0 |
| 2 | 276727 | 0446520802 | 0.0 |
| 3 | 276729 | 052165615X | 3.0 |
| 4 | 276729 | 0521795028 | 6.0 |
| ... | ... | ... | ... |
| 1149775 | 276704 | 1563526298 | 9.0 |
| 1149776 | 276706 | 0679447156 | 0.0 |
| 1149777 | 276709 | 0515107662 | 10.0 |
| 1149778 | 276721 | 0590442449 | 10.0 |
| 1149779 | 276723 | 05162443314 | 8.0 |
1149780 rows × 3 columns
# Keep only users with at least 200 ratings and books with at least 100 ratings.
# Filtering approach adapted from
# https://datascienceplus.com/building-a-book-recommender-system-the-basics-knn-and-matrix-factorization/
counts_user = df_ratings["user"].value_counts()
counts_user = counts_user[counts_user >= 200].index
counts_isbn = df_ratings["isbn"].value_counts()
counts_isbn = counts_isbn[counts_isbn >= 100].index
# Both thresholds are evaluated against counts from the unfiltered ratings.
df_ratings_filtered = df_ratings[
    df_ratings["isbn"].isin(counts_isbn.values)
    & df_ratings["user"].isin(counts_user.values)
]
df_ratings_filtered.shape  # (49781, 3)
Joining the two dataframes
# Join book metadata onto the filtered ratings by ISBN.
df_books_ratings = df_books.merge(df_ratings_filtered, on="isbn")
# Apply str.title() casing to every author name.
df_books_ratings["author"] = df_books_ratings["author"].str.title()
df_books_ratings

| | isbn | title | author | user | rating |
|---|---|---|---|---|---|
| 0 | 0440234743 | The Testament | John Grisham | 277478 | 0.0 |
| 1 | 0440234743 | The Testament | John Grisham | 2977 | 0.0 |
| 2 | 0440234743 | The Testament | John Grisham | 3363 | 0.0 |
| 3 | 0440234743 | The Testament | John Grisham | 7346 | 9.0 |
| 4 | 0440234743 | The Testament | John Grisham | 9856 | 0.0 |
| ... | ... | ... | ... | ... | ... |
| 49512 | 0515135739 | Eleventh Hour: An FBI Thriller (FBI Thriller (... | Catherine Coulter | 236283 | 0.0 |
| 49513 | 0515135739 | Eleventh Hour: An FBI Thriller (FBI Thriller (... | Catherine Coulter | 251613 | 0.0 |
| 49514 | 0515135739 | Eleventh Hour: An FBI Thriller (FBI Thriller (... | Catherine Coulter | 252071 | 0.0 |
| 49515 | 0515135739 | Eleventh Hour: An FBI Thriller (FBI Thriller (... | Catherine Coulter | 256407 | 0.0 |
| 49516 | 0515135739 | Eleventh Hour: An FBI Thriller (FBI Thriller (... | Catherine Coulter | 262399 | 0.0 |
49517 rows × 5 columns
Dropping duplicates and making a pivot table for the KNN algorithm
# Keep the first row for each (title, user) pair; DataFrame.pivot requires the
# index/column combinations to be unique.
df_books_ratings.drop_duplicates(subset=["title", "user"], keep="first", inplace=True)
# One row per title, one column per user; missing ratings become 0.
pivot = df_books_ratings.pivot(index="title", columns="user", values="rating")
pivot = pivot.fillna(0)
pivot.shape  # (673, 888)
pivot

| user | 254 | 2276 | 2766 | 2977 | 3363 | 4017 | 4385 | 6242 | 6251 | 6323 | ... | 274004 | 274061 | 274301 | 274308 | 274808 | 275970 | 277427 | 277478 | 277639 | 278418 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| title | |||||||||||||||||||||
| 1984 | 9.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1st to Die: A Novel | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2nd Chance | 0.0 | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 Blondes | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Without Remorse | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| Year of Wonders | 0.0 | 0.0 | 0.0 | 7.0 | 0.0 | 0.0 | 0.0 | 7.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| You Belong To Me | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| Zen and the Art of Motorcycle Maintenance: An Inquiry into Values | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| \O\" Is for Outlaw" | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 8.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
673 rows × 888 columns
Convert the pivot table above to a compressed sparse row (CSR) matrix, a format that is efficient for arithmetic operations, row slicing, and matrix-vector products. It is particularly useful for large matrices in which most elements are zero, because it saves memory by storing only the non-zero elements.
# Sparse (CSR) copy of the pivot table values, used to fit the model.
matrix = csr_matrix(pivot.values)
matrix
# Output:
# <Compressed Sparse Row sparse matrix of dtype 'float32'
# with 12423 stored elements and shape (673, 888)>

# Brute-force nearest-neighbour search using cosine distance.
model = NearestNeighbors(metric="cosine", algorithm="brute").fit(matrix)
# Output:
# NearestNeighbors(algorithm='brute', metric='cosine')
# In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
# On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# NearestNeighbors(algorithm='brute', metric='cosine')
Getting the results
# Return the recommended books for a given title.
# Adapted from
# https://datascienceplus.com/building-a-book-recommender-system-the-basics-knn-and-matrix-factorization/
def get_recommends(book=""):
    """Return the neighbours of ``book`` found by the fitted ``model``.

    The ``pivot`` row labelled ``book`` is passed to ``model.kneighbors``
    with ``n_neighbors=5``. The first neighbour returned is skipped
    (presumably the queried book itself).

    Returns:
        A two-item list: ``book`` followed by a list of
        ``[title, distance]`` pairs, ordered from the last neighbour
        returned by ``kneighbors`` down to the second one.
    """
    query_row = pivot.loc[book, :].values.reshape(1, -1)
    distances, indices = model.kneighbors(query_row, n_neighbors=5)
    neighbour_distances = distances.flatten()
    neighbour_positions = indices.flatten()
    # Walk backwards and stop before position 0 so the first hit is left out.
    neighbours = [
        [pivot.index[neighbour_positions[rank]], neighbour_distances[rank]]
        for rank in range(len(neighbour_distances) - 1, 0, -1)
    ]
    return [book, neighbours]


import pprint
recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
pprint.pprint(recommends)

["Where the Heart Is (Oprah's Book Club (Paperback))",
[['The Weight of Water', np.float32(0.77085835)],
['The Surgeon', np.float32(0.7699411)],
['I Know This Much Is True', np.float32(0.7677075)],
['The Lovely Bones: A Novel', np.float32(0.7234864)]]]
Testing the results
def test_book_recommendation():
    """Check the module-level ``recommends`` result and print the verdict.

    The check passes when ``recommends[0]`` is the queried title and, for
    each of the first two entries of ``recommends[1]``, the title is one of
    the expected titles and the distance is within 0.05 of the expected
    distance at the same position. Nothing is returned; a pass or fail
    message is printed.
    """
    expected_titles = [
        "I'll Be Seeing You",
        "The Weight of Water",
        "The Surgeon",
        "I Know This Much Is True",
    ]
    expected_distances = [0.8, 0.77, 0.77, 0.77]

    # Record every check (True means it failed) rather than stopping early.
    failures = [
        recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))"
    ]
    for position in range(2):
        entry = recommends[1][position]
        failures.append(entry[0] not in expected_titles)
        failures.append(abs(entry[1] - expected_distances[position]) >= 0.05)

    if any(failures):
        print("You haven't passed yet. Keep trying!")
    else:
        print("You passed the challenge! 🎉🎉🎉🎉🎉")
test_book_recommendation()

You passed the challenge! 🎉🎉🎉🎉🎉