## Warning: package 'tidyr' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.2
# Compact overview of the original dataset: one line per column with its type
# and first few values
books_data %>%
  glimpse()
## Observations: 13,724
## Variables: 10
## $ bookID <int> 1, 2, 3, 4, 5, 8, 9, 10, 12, 13, 14, 16, 18, …
## $ title <chr> "Harry Potter and the Half-Blood Prince (Harr…
## $ authors <chr> "J.K. Rowling-Mary GrandPré", "J.K. Rowling-M…
## $ average_rating <chr> "4.56", "4.49", "4.47", "4.41", "4.55", "4.78…
## $ isbn <chr> "0439785960", "0439358078", "0439554934", "04…
## $ isbn13 <chr> "9780439785969", "9780439358071", "9780439554…
## $ language_code <chr> "eng", "eng", "eng", "eng", "eng", "eng", "en…
## $ X..num_pages <chr> "652", "870", "320", "352", "435", "2690", "1…
## $ ratings_count <int> 1944099, 1996446, 5629932, 6267, 2149872, 388…
## $ text_reviews_count <int> 26249, 27613, 70390, 272, 33964, 154, 1, 820,…
# Rename the "X..num_pages" column to "number_of_pages"
names(books_data)[names(books_data) == "X..num_pages"] <- "number_of_pages"
# Convert the page counts from character to integer; entries that cannot be
# parsed as numbers become NA, which R reports with a coercion warning
books_data[["number_of_pages"]] <- as.integer(books_data[["number_of_pages"]])
## Warning: NAs introduced by coercion
# Build a logical matrix marking every missing cell, then tally the
# TRUE/FALSE values column by column
missing <- books_data %>% is.na()
missing %>% summary()
## bookID title authors average_rating
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:13724 FALSE:13724 FALSE:13724 FALSE:13724
##
## isbn isbn13 language_code number_of_pages
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:13724 FALSE:13724 FALSE:13724 FALSE:13714
## TRUE :10
## ratings_count text_reviews_count
## Mode :logical Mode :logical
## FALSE:13719 FALSE:13719
## TRUE :5 TRUE :5
# Count the NA cells across the whole data frame
books_data %>%
  is.na() %>%
  sum()
## [1] 20
# Drop every row that contains at least one NA
books_data_no_na <- books_data %>% na.omit()
# Sanity check: the NA count of the cleaned data should now be zero
books_data_no_na %>%
  is.na() %>%
  sum()
## [1] 0
# Strip every digit from the language codes; a code that consisted only of
# digits becomes an empty string
books_data_no_na$language_code <- gsub(
  "[0-9]+", "", books_data_no_na$language_code
)
# Drop the rows whose language code is now an empty string
books_data_no_na <- books_data_no_na[books_data_no_na$language_code != "", ]
# Remove the isbn and isbn13 columns from the cleaned data
books_data_new <- subset(books_data_no_na, select = -c(isbn, isbn13))
# Preview the first 10 rows of the cleaned dataset
head(books_data_new, n = 10)
| bookID | title | authors | average_rating | language_code | number_of_pages | ratings_count | text_reviews_count |
|---|---|---|---|---|---|---|---|
| 1 | Harry Potter and the Half-Blood Prince (Harry Potter #6) | J.K. Rowling-Mary GrandPré | 4.56 | eng | 652 | 1944099 | 26249 |
| 2 | Harry Potter and the Order of the Phoenix (Harry Potter #5) | J.K. Rowling-Mary GrandPré | 4.49 | eng | 870 | 1996446 | 27613 |
| 3 | Harry Potter and the Sorcerer’s Stone (Harry Potter #1) | J.K. Rowling-Mary GrandPré | 4.47 | eng | 320 | 5629932 | 70390 |
| 4 | Harry Potter and the Chamber of Secrets (Harry Potter #2) | J.K. Rowling | 4.41 | eng | 352 | 6267 | 272 |
| 5 | Harry Potter and the Prisoner of Azkaban (Harry Potter #3) | J.K. Rowling-Mary GrandPré | 4.55 | eng | 435 | 2149872 | 33964 |
| 8 | Harry Potter Boxed Set Books 1-5 (Harry Potter #1-5) | J.K. Rowling-Mary GrandPré | 4.78 | eng | 2690 | 38872 | 154 |
| 9 | Unauthorized Harry Potter Book Seven News: Half-Blood Prince Analysis and Speculation | W. Frederick Zimmerman | 3.69 | en-US | 152 | 18 | 1 |
| 10 | Harry Potter Collection (Harry Potter #1-6) | J.K. Rowling | 4.73 | eng | 3342 | 27410 | 820 |
| 12 | The Ultimate Hitchhiker’s Guide: Five Complete Novels and One Story (Hitchhiker’s Guide to the Galaxy #1-5) | Douglas Adams | 4.38 | eng | 815 | 3602 | 258 |
| 13 | The Ultimate Hitchhiker’s Guide to the Galaxy | Douglas Adams | 4.38 | eng | 815 | 240189 | 3954 |
# Keep only the books with fewer than 1,000 pages
books_data_new <- filter(books_data_new, number_of_pages < 1000)
# Scatterplot of page count (x) against average rating (y); the rating is
# converted to numeric inside aes() before plotting
ggplot(
  data = books_data_new,
  mapping = aes(x = number_of_pages, y = as.numeric(average_rating))
) +
  geom_point(color = "darkgreen", alpha = 0.5) +
  labs(
    title = "Goodreads Number of Pages vs. Average Rating",
    x = "Number of Pages",
    y = "Average Rating"
  )
# Bar chart of the number of books per language code; the three regional
# English codes are given shortened axis labels
ggplot(books_data_new, aes(x = language_code)) +
  geom_bar(fill = "purple") +
  scale_x_discrete(
    labels = c("en-US" = "US", "en-GB" = "GB", "en-CA" = "CA")
  ) +
  labs(title = "Goodreads Language Codes", x = "Language Code", y = "Count")
# Restrict the data to books whose language code is exactly "eng"
books_data_new.eng <- filter(books_data_new, language_code == "eng")
# Page count vs. average rating for the "eng" subset, with points colored by
# ratings count; the color legend labels are formatted with `comma`
ggplot(
  data = books_data_new.eng,
  mapping = aes(
    x = number_of_pages,
    y = as.numeric(average_rating),
    color = ratings_count
  )
) +
  geom_point(alpha = 0.5) +
  scale_color_continuous(labels = comma) +
  labs(
    title = "Goodreads Number of Pages vs. Average Rating (English Language)",
    x = "Number of Pages",
    y = "Average Rating",
    color = "Ratings Count"
  )
# Keep the "eng" books that have fewer than 500 text reviews
books_data_new.eng.f <- filter(books_data_new.eng, text_reviews_count < 500)
# Text review count vs. average rating for the filtered "eng" subset, with
# points colored by the multiple_authors column (treated as a factor)
# NOTE(review): multiple_authors is not created anywhere in the visible code;
# confirm the column is added to the data upstream
ggplot(
  data = books_data_new.eng.f,
  mapping = aes(
    x = text_reviews_count,
    y = as.numeric(average_rating),
    color = factor(multiple_authors)
  )
) +
  geom_point(alpha = 0.5) +
  labs(
    title = "Goodreads Number of Reviews vs. Average Rating (English Language)",
    x = "Number of Reviews",
    y = "Average Rating",
    color = "Multiple Authors"
  )
# Correlation between text review count and average rating, computed on all
# "eng" books (books_data_new.eng), not only those with fewer than 500 reviews
with(
  books_data_new.eng,
  cor(text_reviews_count, as.numeric(average_rating))
)
## [1] 0.04413932
# Keep only the rating column and the three numeric columns used in the model
books_data_new.eng.1 <- subset(
  books_data_new.eng,
  select = c(average_rating, number_of_pages, ratings_count, text_reviews_count)
)
# Multiple regression of average rating on page count, ratings count and text
# review count; the model is fitted on books_data_new.eng, which holds the same
# rows as the column subset created above
books_data_new.eng.1_mod <- lm(
  formula = as.numeric(average_rating) ~
    number_of_pages + ratings_count + text_reviews_count,
  data = books_data_new.eng
)
books_data_new.eng.1_mod %>% summary()
##
## Call:
## lm(formula = as.numeric(average_rating) ~ number_of_pages + ratings_count +
## text_reviews_count, data = books_data_new.eng)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0525 -0.1517 0.0250 0.1990 1.1616
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.838e+00 7.083e-03 541.848 <2e-16 ***
## number_of_pages 2.586e-04 1.932e-05 13.386 <2e-16 ***
## ratings_count 1.104e-07 5.463e-08 2.020 0.0434 *
## text_reviews_count 3.356e-07 2.451e-06 0.137 0.8911
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3612 on 10357 degrees of freedom
## Multiple R-squared: 0.01925, Adjusted R-squared: 0.01896
## F-statistic: 67.75 on 3 and 10357 DF, p-value: < 2.2e-16