Classwork 7

Data Transformation Practice

Author

Byeong-Hak Choe

Published

March 10, 2025

Modified

April 6, 2025

Consider the flights data.frame from the nycflights13 package.

library(tidyverse)
library(nycflights13)

flights <- flights

DT::datatable(flights |> head(100))

Question 1

  • Look at the number of cancelled flights per day. Is there a pattern? Is the proportion of cancelled flights related to the average delay?
    • Assume that a flight is cancelled if either dep_delay, arr_delay, or both are missing.
Click to Check the Answer!
q1 <- flights |> 
  mutate(is_cancelled = is.na(dep_delay) | is.na(arr_delay),
         date = make_date(year,month,day)) |> 
  group_by(date, is_cancelled) |> 
  summarise(n = n(),
            dep_delay = mean(dep_delay, na.rm = T),
            arr_delay = mean(arr_delay, na.rm = T)) |> 
  mutate(prop = n / sum(n)) 
  # filter(is_cancelled == T) |> 
  # arrange(-prop)


q1_1 <- q1 |> 
   filter(is_cancelled == T) |>
   arrange(-prop)
Click to Check the Answer!
q1 |> 
  filter(is_cancelled == T) |> 
  ggplot(aes(x = date, y = prop)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90))

Click to Check the Answer!
q1 |> 
  ggplot(aes(x = dep_delay, fill = is_cancelled)) +
  geom_density(alpha = .4)

Click to Check the Answer!
q1 |> 
  ggplot(aes(x = arr_delay, fill = is_cancelled)) +
  geom_density(alpha = .4)

Click to Check the Answer!
q1_1 |> 
  filter(prop < .2) |> 
  ggplot(aes(x = prop, y = dep_delay)) +
  geom_point(alpha = .3) +
  geom_smooth() +
  geom_smooth(method = lm, color = 'darkorange')


Question 2

  • Which carrier has the worst arrival delays?
  • Calculate the proportion of flights with an arrival delay greater than 15 minutes for each carrier and for each origin.
  • Can you disentangle the effects of bad airports vs. bad carriers? Why/why not?
Click to Check the Answer!
q2_1 <- flights |> 
  group_by(carrier) |> 
  summarise(arr_delay_max = max(arr_delay, na.rm = T),
            n = n()) |> 
  arrange(-arr_delay_max)

q2_1
# A tibble: 16 × 3
   carrier arr_delay_max     n
   <chr>           <dbl> <int>
 1 HA               1272   342
 2 MQ               1127 26397
 3 AA               1007 32729
 4 DL                931 48110
 5 F9                834   685
 6 9E                744 18460
 7 VX                676  5162
 8 EV                577 54173
 9 FL                572  3260
10 B6                497 54635
11 US                492 20536
12 UA                455 58665
13 WN                453 12275
14 YV                381   601
15 AS                198   714
16 OO                157    32
Click to Check the Answer!
q2_2 <- flights |> 
  group_by(carrier, origin) |> 
  summarize(long_arr_delay = mean(arr_delay > 15, na.rm = T)) |> 
  filter(n() == 3)
Click to Check the Answer!
q2_2 |> 
  ggplot(aes(x = carrier, y = long_arr_delay, fill = carrier)) +
  geom_col(show.legend = F) +
  facet_wrap(~origin, ncol = 1) +
  scale_fill_viridis_d()

Click to Check the Answer!
q2_2 |> 
  ggplot(aes(y = origin, x = long_arr_delay, fill = origin)) +
  geom_col(show.legend = F) +
  facet_wrap(~carrier, nrow = 1) +
  scale_fill_viridis_d()

Click to Check the Answer!
m <- lm(arr_delay ~ origin + carrier, data = flights)
m_int <- lm(arr_delay ~ origin * carrier, data = flights)

summary(m)

Call:
lm(formula = arr_delay ~ origin + carrier, data = flights)

Residuals:
    Min      1Q  Median      3Q     Max 
 -89.38  -23.77  -11.34    7.35 1278.92 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   9.4813     0.4054  23.390  < 2e-16 ***
originJFK    -2.3170     0.2577  -8.991  < 2e-16 ***
originLGA    -1.9093     0.2371  -8.052 8.15e-16 ***
carrierAA    -7.2351     0.4256 -16.999  < 2e-16 ***
carrierAS   -19.4122     1.7121 -11.338  < 2e-16 ***
carrierB6     1.9717     0.3871   5.093 3.52e-07 ***
carrierDL    -5.9238     0.4011 -14.769  < 2e-16 ***
carrierEV     6.6826     0.4356  15.342  < 2e-16 ***
carrierF9    14.3488     1.7401   8.246  < 2e-16 ***
carrierFL    12.5440     0.8747  14.341  < 2e-16 ***
carrierHA   -14.0794     2.4190  -5.820 5.87e-09 ***
carrierMQ     3.1542     0.4528   6.966 3.26e-12 ***
carrierOO     3.9641     8.2334   0.481    0.630    
carrierUA    -5.4858     0.4262 -12.873  < 2e-16 ***
carrierUS    -5.7979     0.4801 -12.076  < 2e-16 ***
carrierVX    -6.1027     0.7068  -8.634  < 2e-16 ***
carrierWN     1.1171     0.5520   2.024    0.043 *  
carrierYV     7.9851     1.9374   4.122 3.76e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 44.29 on 327328 degrees of freedom
  (9430 observations deleted due to missingness)
Multiple R-squared:  0.0153,    Adjusted R-squared:  0.01525 
F-statistic: 299.2 on 17 and 327328 DF,  p-value: < 2.2e-16
Click to Check the Answer!
summary(m_int)

Call:
lm(formula = arr_delay ~ origin * carrier, data = flights)

Residuals:
    Min      1Q  Median      3Q     Max 
 -85.32  -23.89  -11.48    7.38 1278.92 

Coefficients: (13 not defined because of singularities)
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)           1.6153     1.2809   1.261 0.207315    
originJFK             7.2281     1.3354   5.413 6.21e-08 ***
originLGA             0.1533     1.5718   0.098 0.922310    
carrierAA            -0.6376     1.4909  -0.428 0.668927    
carrierAS           -11.5461     2.0980  -5.503 3.73e-08 ***
carrierB6             7.7733     1.3940   5.576 2.46e-08 ***
carrierDL             7.1652     1.4480   4.948 7.48e-07 ***
carrierEV            15.4074     1.2992  11.859  < 2e-16 ***
carrierF9            20.1522     1.9246  10.471  < 2e-16 ***
carrierFL            18.3474     1.2026  15.256  < 2e-16 ***
carrierHA           -15.7585     2.4220  -6.506 7.71e-11 ***
carrierMQ            14.6918     1.6045   9.157  < 2e-16 ***
carrierOO            19.8847    18.1078   1.098 0.272147    
carrierUA             1.8599     1.2976   1.433 0.151767    
carrierUS            -0.6381     1.4468  -0.441 0.659169    
carrierVX            -2.2924     1.7036  -1.346 0.178406    
carrierWN             9.4480     1.4014   6.742 1.57e-11 ***
carrierYV            13.7884     2.1043   6.552 5.67e-11 ***
originJFK:carrierAA  -6.1245     1.5841  -3.866 0.000111 ***
originLGA:carrierAA  -2.4627     1.7842  -1.380 0.167491    
originJFK:carrierAS       NA         NA      NA       NA    
originLGA:carrierAS       NA         NA      NA       NA    
originJFK:carrierB6  -7.7230     1.4604  -5.288 1.24e-07 ***
originLGA:carrierB6   3.9695     1.7619   2.253 0.024260 *  
originJFK:carrierDL -18.3878     1.5278 -12.035  < 2e-16 ***
originLGA:carrierDL  -5.0060     1.7356  -2.884 0.003923 ** 
originJFK:carrierEV  -6.4619     1.8184  -3.554 0.000380 ***
originLGA:carrierEV  -7.8960     1.6600  -4.757 1.97e-06 ***
originJFK:carrierF9       NA         NA      NA       NA    
originLGA:carrierF9       NA         NA      NA       NA    
originJFK:carrierFL       NA         NA      NA       NA    
originLGA:carrierFL       NA         NA      NA       NA    
originJFK:carrierHA       NA         NA      NA       NA    
originLGA:carrierHA       NA         NA      NA       NA    
originJFK:carrierMQ -11.0665     1.7329  -6.386 1.70e-10 ***
originLGA:carrierMQ  -7.1255     1.8777  -3.795 0.000148 ***
originJFK:carrierOO       NA         NA      NA       NA    
originLGA:carrierOO -12.2185    20.3428  -0.601 0.548087    
originJFK:carrierUA  -8.1928     1.5045  -5.446 5.17e-08 ***
originLGA:carrierUA   1.0137     1.6627   0.610 0.542067    
originJFK:carrierUS  -6.0912     1.7018  -3.579 0.000345 ***
originLGA:carrierUS   1.4004     1.7548   0.798 0.424835    
originJFK:carrierVX  -3.7232     1.8957  -1.964 0.049534 *  
originLGA:carrierVX       NA         NA      NA       NA    
originJFK:carrierWN       NA         NA      NA       NA    
originLGA:carrierWN  -2.9976     1.7666  -1.697 0.089727 .  
originJFK:carrierYV       NA         NA      NA       NA    
originLGA:carrierYV       NA         NA      NA       NA    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 44.24 on 327311 degrees of freedom
  (9430 observations deleted due to missingness)
Multiple R-squared:  0.01748,   Adjusted R-squared:  0.01738 
F-statistic: 171.3 on 34 and 327311 DF,  p-value: < 2.2e-16


Question 3

  • Find all destinations that are flown by at least two carriers. Use that information to rank the carriers.
Click to Check the Answer!
q3_1 <- flights |> 
  group_by(carrier, dest) |> 
  summarize(n_flights = n()) |> 
  summarise(n_dests = n()) |> 
  arrange(-n_dests)


q3_2 <- flights |> 
  distinct(carrier, dest) |> 
  count(carrier) |> 
  arrange(-n)

q3_3 <- flights |> 
  distinct(carrier, dest) |> 
  count(dest) |> 
  arrange(-n) |> 
  filter(n >= 2)


Question 4

  • The following is the data.frame for Question 4.
holiday_movies <- read_csv("https://bcdanl.github.io/data/holiday_movies.csv")
  • The data.frame holiday_movies comes from the Internet Movie Database (IMDb).

  • The following is the data.frame, holiday_movies.

ABCDEFGHIJ0123456789
tconst
<chr>
title_type
<chr>
primary_title
<chr>
tt0020356movieSailor's Holiday
tt0020823movieThe Devil's Holiday
tt0020985movieHoliday
tt0021268movieHoliday of St. Jorgen
tt0021377movieSin Takes a Holiday
tt0021381movieSinners' Holiday
tt0023039movieHusband's Holiday
tt0024869movieBeggar's Holiday
tt0025006movieCowboy Holiday
tt0025037movieDeath Takes a Holiday
tt0027456movieCollege Holiday
tt0027923movieMad Holiday
tt0028310movieStolen Holiday
tt0028576movieAngel's Holiday
tt0028763movieDangerous Holiday
tt0028843movieEvery Day's a Holiday
tt0029894movieBank Holiday
tt0029992movieA Christmas Carol
tt0030027movieCrime Takes a Holiday
tt0030241movieHoliday

Variable description

  • tconst: alphanumeric unique identifier of the title

  • title_type: the type/format of the title

    • (movie, video, or tvMovie)
  • primary_title: the more popular title / the title used by the filmmakers on promotional materials at the point of release

  • simple_title: the title in lowercase, with punctuation removed, for easier filtering and grouping

  • year: the release year of a title

  • runtime_minutes: primary runtime of the title, in minutes

  • average_rating: weighted average of all the individual user ratings on IMDb

  • num_votes: number of votes the title has received on IMDb (titles with fewer than 10 votes were not included in this dataset)



  • The following is another data.frame holiday_movie_genres that is related with the data.frame holiday_movies:
holiday_movie_genres <- read_csv("https://bcdanl.github.io/data/holiday_movie_genres.csv")
ABCDEFGHIJ0123456789
tconst
<chr>
genres
<chr>
tt0020356Comedy
tt0020823Drama
tt0020823Romance
tt0020985Comedy
tt0020985Drama
tt0021268Comedy
tt0021377Comedy
tt0021377Romance
tt0021381Adventure
tt0021381Crime
tt0021381Romance
tt0023039Drama
tt0024869Crime
tt0024869Drama
tt0024869Romance
tt0025006Western
tt0025037Drama
tt0025037Fantasy
tt0025037Romance
tt0027456Comedy
  • The data.frame holiday_movie_genres include up to three genres associated with the titles that appear in the data.frame.

Variable description

  • tconst: alphanumeric unique identifier of the title
  • genres: genres associated with the title, one row per genre



Q4a.

  • Provide the R code to generate the data.frame, holiday_movie_with_genres, which combines the two data.frames, holiday_movies and holiday_movie_genres:
Click to Check the Answer!
holiday_movies_with_genres <- holiday_movie_genres |> 
  left_join(holiday_movies)
  • The following shows the first four variables in holiday_movie_with_genres:
ABCDEFGHIJ0123456789
tconst
<chr>
genres
<chr>
title_type
<chr>
primary_title
<chr>
simple_title
<chr>
year
<dbl>
tt0020356ComedymovieSailor's Holidaysailors holiday1929
tt0020823DramamovieThe Devil's Holidaythe devils holiday1930
tt0020823RomancemovieThe Devil's Holidaythe devils holiday1930
tt0020985ComedymovieHolidayholiday1930
tt0020985DramamovieHolidayholiday1930
tt0021268ComedymovieHoliday of St. Jorgenholiday of st jorgen1930
tt0021377ComedymovieSin Takes a Holidaysin takes a holiday1930
tt0021377RomancemovieSin Takes a Holidaysin takes a holiday1930
tt0021381AdventuremovieSinners' Holidaysinners holiday1930
tt0021381CrimemovieSinners' Holidaysinners holiday1930
tt0021381RomancemovieSinners' Holidaysinners holiday1930
tt0023039DramamovieHusband's Holidayhusbands holiday1931
tt0024869CrimemovieBeggar's Holidaybeggars holiday1934
tt0024869DramamovieBeggar's Holidaybeggars holiday1934
tt0024869RomancemovieBeggar's Holidaybeggars holiday1934
tt0025006WesternmovieCowboy Holidaycowboy holiday1934
tt0025037DramamovieDeath Takes a Holidaydeath takes a holiday1934
tt0025037FantasymovieDeath Takes a Holidaydeath takes a holiday1934
tt0025037RomancemovieDeath Takes a Holidaydeath takes a holiday1934
tt0027456ComedymovieCollege Holidaycollege holiday1936



Q4b.

  • Provide the R code using skimr::skim() to see how the summary statistics—mean, median, standard deviation, minimum, maximum, first and third quartiles—of average_rating and num_votes varies by popular genres and title_type.

    • Consider only the five popular genres, which are selected in terms of the number of titles for each genre.
    • Removes the video type of the titles when calculating the summary statistics.
Click to Check the Answer!
popular_genres <- holiday_movies_with_genres |> 
  group_by(genres) |> 
  count() |> 
  ungroup() |> 
  slice_max(n, n = 5)

holiday_movies_with_genres |> 
  filter(genres %in% popular_genres$genres,
         title_type != 'video') |> 
  group_by(genres, title_type) |> 
  skimr::skim(average_rating, num_votes) 



Q4c.

  • Provide R code to recreate the ggplot figure illustrating how the relationship between log10(num_votes) and average_rating varies by the popular genres and title_type.
    • The five popular genres are selected in terms of the number of titles for each genre.
    • The video type of the titles are removed in the ggplot figure.
Click to Check the Answer!
popular_genres <- holiday_movies_with_genres |> 
  group_by(genres) |> 
  count() |> 
  ungroup() |> 
  slice_max(n, n = 5)

holiday_movies_with_genres |> 
  filter(genres %in% popular_genres$genres,
         title_type != 'video') |> 
  group_by(genres) |> 
  mutate(mean_rating = mean(average_rating, na.rm = T)) |> 
  ggplot(aes(y = average_rating, x = log10(num_votes))) +
  geom_point(alpha = .2) +
  geom_smooth(aes(color = genres),
              method = lm) +
  # coord_cartesian(ylim = c(5,7)) +
  facet_grid(title_type~ genres, scales = "free")



Q4d.

  • Provide a comment to illustrate how the relationship between log10(num_votes) and average_rating varies by the popular genres and title_type.



Q4e.

  • Provide R code to recreate the ggplot figure illustrating the annual trend of the share of number of movies by popular genre from year 1975 to 2022.

    • For genres that are not popular, categorize them as “Other”.
      • Consider reordering the categories in genres in descending order based on their share in the year 2022.
      • Use “Set2” color palette from the RColorBrewer package.
Click to Check the Answer!
holiday_movies_with_genres |> 
  mutate(genres = ifelse( !(genres %in% popular_genres$genres), "Other", genres )) |> 
  group_by(year, genres) |> 
  count() |> 
  filter(year >= 1975, year <= 2022) |> 
  ggplot() +
  geom_col(aes(x = year, y = n, 
               fill = fct_reorder2(genres, year, n)), 
           position = 'fill', 
           width = rel(1.25),
           color = 'transparent') +
  labs(y = "Share of number of movies by genre",
       fill = "Genre", x = "",
       title = "How Have Christmas Movie Genres Evolved Over Time?") +
  scale_fill_brewer(palette = "Set2") +
  hrbrthemes::scale_y_percent() +
  guides(fill = guide_legend(title.position = "top",
                             # label.position = "bottom",
                             keywidth = 3,
                             keyheight = rel(3.4),
                             ncol = 1)) +
  theme(legend.position = "right",
        legend.title = 
          element_text(face = "bold.italic",
                       size = rel(1.25),
                       hjust = .5),
        legend.text = 
          element_text(face = "italic",
                       size = rel(1.25),
                       hjust = .5),
        legend.box.margin = margin(-20, 0, 0, -20),
        panel.grid.major.x = element_blank(),
        panel.grid.major.y = element_blank(),
        axis.text.y = element_text(margin = margin(0,-20,0,0)),
        axis.title.x = element_text(face = "bold",
                                    size = rel(1.5)),
        axis.title.y = element_text(face = "bold",
                                    size = rel(1.5)),
        plot.title = element_text(
          margin = margin(0,0,20,0),
          hjust = .33
        )
        )

  • c.f.) The following uses geom_area() instead:
Click to Check the Answer!
df <- holiday_movies_with_genres |> 
  mutate(genres = ifelse(!(genres %in% popular_genres$genres), "Other", genres)) |> 
  group_by(year, genres) |> 
  count() |> 
  filter(year >= 1975, year <= 2022) |> 
  group_by(year) |> 
  mutate(prop = n / sum(n)) |>  # Convert count to proportion
  ungroup()

# df2 <- df |> 
#   group_by(year) |> 
#   summarise(sum = sum(prop))

df |> 
  ggplot(aes(x = year, y = prop, fill = fct_reorder2(genres, year, prop))) +
  geom_area(position = "fill") +  # Stack by genre
  labs(y = "Share of number of movies by genre",
       fill = "Genre", x = "",
       title = "How Have Christmas Movie Genres Evolved Over Time?") +
  scale_fill_brewer(palette = "Set2") +
  hrbrthemes::scale_y_percent() +
  guides(fill = guide_legend(title.position = "top",
                             # label.position = "bottom",
                             keywidth = 3,
                             keyheight = rel(3.4),
                             ncol = 1)) +
  theme(legend.position = "right",
        legend.title = element_text(face = "bold.italic",
                                    size = rel(1.25),
                                    hjust = .5),
        legend.text = element_text(face = "italic",
                                   size = rel(1.25),
                                   hjust = .5),
        legend.box.margin = margin(-20, 0, 0, -20),
        panel.grid.major.x = element_blank(),
        panel.grid.major.y = element_blank(),
        axis.text.y = element_text(margin = margin(0, -20, 0, 0)),
        axis.title.x = element_text(face = "bold",
                                    size = rel(1.5)),
        axis.title.y = element_text(face = "bold",
                                    size = rel(1.5)),
        plot.title = element_text(
          margin = margin(0,0,20,0),
          hjust = .33
        )
  )


Q4f.

  • Provide a comment to illustrate the annual trend of (1) the share of number of movies by popular genre from year 1975 to 2022.

    • Which genre has become more popular since 2010?



Q4g.

  • Add the following two variables—christmas and holiday—to the data.frame holiday_movies_with_genres:

  • christmas:

    • TRUE if the simple_title includes “christmas”, “xmas”, “x mas”
    • FALSE otherwise
  • holiday:

    • TRUE if the simple_title includes “holiday”
    • FALSE otherwise
Click to Check the Answer!
holiday_movies_with_genres <- holiday_movies_with_genres |> 
  mutate(christmas = ifelse(str_detect(simple_title, "christmas") |
                              str_detect(simple_title, "xmas") |
                              str_detect(simple_title, "x mas"),
                            T, F),
         holiday = ifelse(str_detect(simple_title, "holiday"),
                            T, F),
         )



Q4h.

  • Provide R code to recreate the ggplot figure illustrating the annual trend of (1) the number of movie titles with “holiday” varies by christmas.
Click to Check the Answer!
holiday_movies_with_genres |> 
  group_by(year, christmas, holiday) |> 
  count() |> 
  ggplot(aes(x = year, y = n, color = holiday)) +
  geom_smooth() +
  geom_point(alpha = .33) +
  facet_wrap(christmas~., scales = "free")



Q4i.

  • Provide R code to recreate the ggplot figure illustrating how the mean value of num_votes varies by the popular genres for the titles with “christmas”.
Click to Check the Answer!
holiday_movies_with_genres |> 
  filter(genres %in% popular_genres$genres) |> 
  group_by(genres, christmas) |> 
  summarise(mean_rating = mean(average_rating),
            mean_votes = mean(num_votes)) |> 
  filter(christmas == T) |> 
  ggplot(aes(x = mean_votes, y = fct_reorder(genres, mean_votes),
             )) +
  geom_point(size = 2) +
  labs(y = "genres")


Question 5

  • The following is the data.frame for Question 5.
tripadvisor <- read_csv("https://bcdanl.github.io/data/tripadvisor_cleaned.csv")
  • TripAdvisor is an online travel research company that empowers people around the world to plan and enjoy the ideal trip.

  • TripAdvisor wanted to know whether promoting membership on their platform could drive engagement and bookings.

  • To do so, TripAdvisor had just run an experiment to explore user retention by offering a random subset of customers an easier sign-up process for membership.

  • The following is the data.frame, tripadvisor.

ABCDEFGHIJ0123456789
id
<dbl>
time
<chr>
easier_signup
<lgl>
days_visited
<dbl>
became_member
<lgl>
locale_en_US
<lgl>
1PREFALSE1FALSETRUE
1POSTFALSE1FALSETRUE
2PREFALSE10FALSEFALSE
2POSTFALSE15FALSEFALSE
3PREFALSE18FALSETRUE
3POSTFALSE17FALSETRUE
4PREFALSE17FALSETRUE
4POSTFALSE6FALSETRUE
5PREFALSE24FALSETRUE
5POSTFALSE12FALSETRUE
6PRETRUE11TRUETRUE
6POSTTRUE16TRUETRUE
7PREFALSE15FALSEFALSE
7POSTFALSE3FALSEFALSE
8PREFALSE11FALSETRUE
8POSTFALSE1FALSETRUE
9PRETRUE27TRUEFALSE
9POSTTRUE14TRUEFALSE
10PRETRUE0TRUEFALSE
10POSTTRUE1TRUEFALSE

Variable description

  • id: a unique identifier for a user.

  • time:

    • PRE if time is before the experiment;
    • POST if time is in the 28 days after the experiment.
    • For each id value, there are two observations—one with time == “PRE” and the other with time == “POST”.
  • days_visited: Number of days a user visited the TripAdvisor website.

  • easier_signup:

    • TRUE if a user was exposed to the easier signup process (e.g., one-click signup) during the experiment;
    • FALSE otherwise.
  • became_member:

    • TRUE if a user became a member during the experiment period;
    • FALSE otherwise.
  • locale_en_US:

    • TRUE if a user accessed the website from the US;
    • FALSE otherwise.
  • os_type: Windows, Mac, or Others

  • revenue_pre: Amount of dollars a user spent on the website before the experiment


Q5a.

  • Using the given data.frame, tripadvisor, create the data.frame, tripadvisor, for which

    • time is a factor-type variable of time with the first level, “PRE”.
Click to Check the Answer!
tripadvisor <- tripadvisor |> 
  mutate(time = factor(time, levels = c("PRE", "POST")),
         os_type = factor(os_type, levels = c("Windows", "Mac", "Others") ) )



Q5b.

  • Provide R code to recreate the ggplot figure illustrating how the relationship between time and days_visited varies by easier_signup and became_member.

    • Row-wise split is based on easier_signup.
    • Column-wise split is based on became_member.
Click to Check the Answer!
ggplot(tripadvisor,
       aes(y = days_visited, 
                   x = time)) +
  geom_boxplot(aes(fill = time) ) +
  facet_grid(easier_signup ~ became_member)



Q5c.

  • Provide a comment to illustrate how the relationship between time and days_visited varies by easier_signup and became_member.



Q5d.

  • Provide a R code to create the data.frame Q5d that includes the variable diff, the difference between (1) the value of days_visited for time == PRE and (2) the value of days_visited for time == POST for each id.
Click to Check the Answer!
q2d <- tripadvisor_wide <- tripadvisor |> 
  pivot_wider(names_from = time,
              values_from = days_visited) |> 
  relocate(PRE, POST, .after = id) |> 
  mutate(diff = POST - PRE, .before = PRE) |> 
  select(id:became_member)
  • The resulting data.frame should look as follows:
ABCDEFGHIJ0123456789
id
<dbl>
diff
<dbl>
PRE
<dbl>
POST
<dbl>
easier_signup
<lgl>
became_member
<lgl>
1011FALSEFALSE
251015FALSEFALSE
3-11817FALSEFALSE
4-11176FALSEFALSE
5-122412FALSEFALSE
651116TRUETRUE
7-12153FALSEFALSE
8-10111FALSEFALSE
9-132714TRUETRUE
10101TRUETRUE
11-17236TRUETRUE
12-143FALSEFALSE
132810FALSEFALSE
1461016TRUETRUE
15-9123FALSEFALSE
1651116TRUETRUE
17-20200TRUEFALSE
1841115FALSEFALSE
1901515TRUEFALSE
20-32219TRUETRUE
  • Include only the variables as shown above.
  • Adjust the order of the variables as shown above.



Q5e.

  • Provide an R code to calculate how the difference in the mean value of diff varies by easier_signup and became_member using the data.frame Q5d.
Click to Check the Answer!
q2e <- q2d |> 
  group_by(easier_signup, became_member) |> 
  summarise(mean_diff = round(mean(diff),2))
  • The resulting data.frame should look as follows:
ABCDEFGHIJ0123456789
easier_signup
<lgl>
became_member
<lgl>
mean_diff
<dbl>
FALSEFALSE-3.89
FALSETRUE-2.60
TRUEFALSE-5.71
TRUETRUE-0.62



Q5f.

  • Using the resulting data.frame in Q5e, discuss the following question:
    • What is the effect of easier signup process on the number of days a user visited the TripAdvisor website?
    • How does this effect varies by the status of became_member?



Question 6

In September 2019, YouGov survey asked 1,639 GB adults the following question:

In hindsight, do you think Britain was right/wrong to vote to leave EU?

  • Right to leave
  • Wrong to leave
  • Don’t know

The data from the survey is in brexit.csv.

brexit <- read_csv('https://bcdanl.github.io/data/brexit.csv')
datatable(brexit)


Q6a

  • Replicate the following visualization
Click to Check the Answer!
brexit <- brexit |> 
  mutate(
    region = fct_relevel(region, 
                         "london", "rest_of_south", "midlands_wales", "north", "scot"),
    region = fct_recode(region, 
                        London = "london", 
                        `Rest of South` = "rest_of_south", 
                        `Midlands / Wales` = "midlands_wales", 
                        North = "north", 
                        Scotland = "scot")
  )

ggplot(brexit, 
       aes(y = opinion, fill = opinion)) +
  geom_bar() +
  facet_wrap( ~ region, 
             nrow = 1, 
             labeller = label_wrap_gen(width = 12)) +
  guides(fill = "none") +
  labs(
    title = "Was Britain right/wrong to vote to leave EU?",
    subtitle = "YouGov Survey Results, 2-3 September 2019",
    caption = "Source: bit.ly/2lCJZVg",
    x = NULL, y = NULL
  ) +
  scale_fill_manual(values = c(
    "gray",
    "#67a9cf",
    "#ef8a62"
  )) +
  theme_minimal()



Q6b

  • Replicate the following visualization
    • How is the story this visualization telling different than the story the plot in Q6a?
Click to Check the Answer!
ggplot(brexit, 
       aes(y = opinion, fill = opinion)) +
  geom_bar() +
  facet_wrap(~region, scales = 'free_x',
    nrow = 1, labeller = label_wrap_gen(width = 12),
    # ___
  ) +
  guides(fill = "none") +
  labs(
    title = "Was Britain right/wrong to vote to leave EU?",
    subtitle = "YouGov Survey Results, 2-3 September 2019",
    caption = "Source: bit.ly/2lCJZVg",
    x = NULL, y = NULL
  ) +
  scale_fill_manual(values = c(
    "Wrong" = "#ef8a62",
    "Right" = "#67a9cf",
    "Don't know" = "gray"
  )) +
  theme_minimal()



Q6c

  • First, calculate the proportion of wrong, right, and don’t know answers in each region and then plot these proportions (rather than the counts) and then improve axis labeling.
Click to Check the Answer!
q6 <- brexit |> 
  group_by(region, opinion) |>  
  summarise(n = n()) |> 
  mutate(tot = sum(n),
         prop = n / tot ) 
  • Replicate the following visualization
    • How is the story this visualization telling different than the story the plot in Q4b?
Click to Check the Answer!
ggplot(q6, aes(y = opinion, x = prop,
               fill = opinion)) +
  geom_col() +
  facet_wrap(~region,
    nrow = 1, labeller = label_wrap_gen(width = 12),
    # ___
  ) +
  guides(fill = "none") +
  labs(
    title = "Was Britain right/wrong to vote to leave EU?",
    subtitle = "YouGov Survey Results, 2-3 September 2019",
    caption = "Source: bit.ly/2lCJZVg",
    x = 'Percent', y = NULL
  ) +
  scale_fill_manual(values = c(
    "Wrong" = "#ef8a62",
    "Right" = "#67a9cf",
    "Don't know" = "gray"
  )) +
  scale_x_continuous(labels = scales::percent) +
  theme_minimal()



Q6d.

  • Recreate the same visualization from the previous exercise, this time dodging the bars for opinion proportions for each region, rather than faceting by region and then improve the legend.

    • How is the story this visualization telling different than the story the previous plot tells?
Click to Check the Answer!
ggplot(q6, aes(y = region, x = prop,
               fill = opinion)) +
  geom_col(position = "dodge") +
  labs(
    title = "Was Britain right/wrong to vote to leave EU?",
    subtitle = "YouGov Survey Results, 2-3 September 2019",
    caption = "Source: bit.ly/2lCJZVg",
    x = 'Percent', y = NULL, fill = 'Opinion'
  ) +
  scale_fill_manual(values = c(
    "Wrong" = "#ef8a62",
    "Right" = "#67a9cf",
    "Don't know" = "gray"
  )) +
  scale_x_continuous(labels = scales::percent) +
  theme_minimal() 

Discussion

Welcome to our Classwork 7 Discussion Board! 👋

This space is designed for you to engage with your classmates about the material covered in Classwork 7.

Whether you are looking to delve deeper into the content, share insights, or have questions about the content, this is the perfect place for you.

If you have any specific questions for Byeong-Hak (@bcdanl) regarding the Classwork 7 materials or need clarification on any points, don’t hesitate to ask here.

All comments will be stored here.

Let’s collaborate and learn from each other!

Back to top