Tidytuesday Avatar 2020-08-12
library(tidytuesdayR)
library(tidyverse)
library(tidymodels)
library(stringr)
Getting data
avatar <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-08-11/avatar.csv')
scene_description <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-08-11/scene_description.csv')
Data exploration
avatar_summary_chapter<-
avatar %>% group_by(book_num,chapter_num) %>%
summarize(mean_rating = mean(imdb_rating, na.rm = TRUE)) %>%
as_tibble() %>%
mutate_if(is.character,as.factor)
avatar_summary_chapter %>% group_by(book_num) %>%
summarize(mean_rating = mean(mean_rating, na.rm = TRUE)) %>%
ggplot(aes(book_num,mean_rating)) +
geom_bar(stat='identity')+
theme(legend.position='none')+
labs(title = "Imdb rating acroos the three books",
y = "Mean imdb rating", x = "Book number")
avatar_summary_character<-avatar %>% group_by(character,book_num) %>%
summarize(mean_rating = mean(imdb_rating)) %>%
as_tibble() %>%
mutate_if(is.character,as.factor)
avatar_summary_character %>%
ggplot(aes(book_num,mean_rating,group=character))+
geom_line(aes(group = character)) +
geom_point(aes(color=character))+
theme(legend.position='none')+
labs(title = "Imdb rating associated to each character",
y = "Mean imdb rating", x = "Book number")
avatar_summary_writer<-avatar %>% group_by(writer,book_num) %>%
summarize(mean_rating = mean(imdb_rating,na.rm=TRUE)) %>%
as_tibble()
# mutate_if(is.character,as.factor)
avatar_summary_writer$writer <-avatar_summary_writer$writer %>% str_trunc(.,30,"right")
avatar_summary_writer<-avatar_summary_writer %>% mutate_if(is.character,as.factor)
avatar_summary_writer %>%
ggplot(aes(book_num,mean_rating))+
geom_line(aes(group = writer, color=writer)) +
geom_point(aes( color=writer))+
theme(legend.position='top')+
labs(title = "Imdb rating associated to each writer",
y = "Mean imdb rating", x = "Book number")
avatar_summary_director<-avatar %>% group_by(director,book_num) %>%
summarize(mean_rating = mean(imdb_rating,na.rm=TRUE)) %>%
as_tibble() %>%
mutate_if(is.character,as.factor)
avatar_summary_director %>%
ggplot(aes(book_num,mean_rating))+
geom_line(aes(group = director, color=director)) +
geom_point(aes( color=director))+
theme(legend.position='top')+
labs(title = "Imdb rating associated to each director",
y = "Mean imdb rating", x = "Book number")
avatar_summary_director_writer<-avatar %>% group_by(writer,director,book_num) %>%
summarize(mean_rating = mean(imdb_rating,na.rm=TRUE)) %>%
as_tibble() %>%
mutate(combination=paste(writer,director,sep="."))
avatar_summary_director_writer$combination <-avatar_summary_director_writer$combination %>% str_trunc(.,30,"right")
avatar_summary_director_writer<-avatar_summary_director_writer %>% mutate_if(is.character,as.factor)
#mutate_if(is.character,as.factor)
avatar_summary_director_writer %>%
drop_na() %>%
ggplot(aes(mean_rating,combination))+
geom_bar(position="dodge", stat="identity")+
theme(legend.position='top')+
labs(title = "Imdb rating associated to writer-director combos",
x = "Mean imdb rating", y = "")
It looks like episodes associated to later books got a slightly better Imdb rating. Some characters tend to be associated with better Imdb rating. Also it seems that episodes based on chapters written by some authors get very different Imdb rating. For instance episodes based on Book 1 chapters written by some authors got low Imdb rating and those authors did not participate writing chapters in Book 2 and Book 3. It looks like some star author(s) were hired for the last book, and those chapters got highest Imdb rating. Finally, some directors that directed episodes based on Book 2 and 3 got pretty high rating, and it looks like those directors were hired for one season only.
Some writers may be writing more about popular characters and likewise some episodes directed by some directors may have higher presence of some popular characters, which may be pushing higher Imdb rates these artist may eventually receive. Maybe a combination of best writers and directors gives the top rates.
Next question we have is what parameters are driving Imdb rating?
avatar_simple<-avatar %>%
select(character,book_num,chapter_num,writer,director,imdb_rating) %>%
dplyr::mutate(book_num = replace_na(book_num, 0),
chapter_num = replace_na(chapter_num, 0),
imdb_rating= replace_na(imdb_rating, 0),
character= replace_na(character, 0),
) %>%
dplyr::filter(imdb_rating!=0) %>%
mutate(imdb_rating = case_when(
imdb_rating > 8.75 ~ "high",
imdb_rating < 8.75 ~ "low")) %>% unique()%>%
na.omit() %>%
mutate_if(is.character, factor) %>%
janitor::clean_names() %>%
mutate(character = fct_lump(character,100))
avatar_simple %>% count(imdb_rating)
## # A tibble: 2 x 2
## imdb_rating n
## <fct> <int>
## 1 high 474
## 2 low 475
We can consider an Imdb rating as low if this is <8.75 and high if Imdb rating is >8.75
Building models with recipes
avatar_split <- initial_split(avatar_simple, strata = imdb_rating)
avatar_train <- training(avatar_split)
avatar_test <- testing(avatar_split)
avatar_rec <- recipe(imdb_rating ~ ., data = avatar_train) %>%
# update_role(id, new_role = "ID") %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
step_zv(all_numeric()) %>%
step_normalize(all_numeric())
avatar_prep <- avatar_rec %>%prep()
avatar_juiced <- juice(avatar_prep)
glm_spec <- logistic_reg() %>%
set_engine("glm")
glm_fit <- glm_spec %>%
fit(imdb_rating ~ ., data = avatar_juiced)
knn_spec <- nearest_neighbor() %>%
set_engine("kknn") %>%
set_mode("classification")
knn_fit <- knn_spec %>%
fit(imdb_rating ~ ., data = avatar_juiced)
tree_spec <- decision_tree() %>%
set_engine("rpart") %>%
set_mode("classification")
tree_fit <- tree_spec %>%
fit(imdb_rating ~ ., data = avatar_juiced)
We can now evaluate the models with resampling
set.seed(2020)
folds <- vfold_cv(avatar_train, strata = imdb_rating)
We can estimate the models metrics
set.seed(234)
glm_rs <- glm_spec %>%
fit_resamples(
avatar_rec,
folds,
metrics = metric_set(roc_auc, sens, spec),
control = control_resamples(save_pred = TRUE)
)
set.seed(234)
knn_rs <- knn_spec %>%
fit_resamples(
avatar_rec,
folds,
metrics = metric_set(roc_auc, sens, spec),
control = control_resamples(save_pred = TRUE)
)
set.seed(234)
tree_rs <- tree_spec %>%
fit_resamples(
avatar_rec,
folds,
metrics = metric_set(roc_auc, sens, spec),
control = control_resamples(save_pred = TRUE)
)
glm_rs %>%
unnest(.predictions) %>%
mutate(model = "glm") %>%
bind_rows(knn_rs %>%
unnest(.predictions) %>%
mutate(model = "knn")) %>%
group_by(model) %>%
roc_curve(imdb_rating, .pred_high) %>%
ggplot(aes(x = 1 - specificity, y = sensitivity, color = model)) +
geom_line(size = 1.5) +
geom_abline(
lty = 2, alpha = 0.5,
color = "gray50",
size = 1.2
)
Logistic regression model performs a bit better, so we can look at the parameters in detail.
glm_fit %>%
tidy() %>%
dplyr::filter(p.value<0.05) %>%
arrange(-estimate)
## # A tibble: 6 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 writer_John.O.Bryan 0.830 0.198 4.19 2.75e- 5
## 2 writer_Tim.Hedrick 0.679 0.192 3.54 3.97e- 4
## 3 writer_Elizabeth.Welch.Ehasz -0.750 0.233 -3.21 1.31e- 3
## 4 writer_Michael.Dante.DiMartino -0.760 0.196 -3.87 1.10e- 4
## 5 book_num -1.54 0.295 -5.20 2.00e- 7
## 6 chapter_num -2.15 0.260 -8.28 1.24e-16
Episodes based on scripts written by John.O.Bryan and Tim Hedrick are more likely to get lower Imdb ratings and episodes based on latest books get higher Imdb ratings .