Classwork 7

Data Transformation Practice

Author

Byeong-Hak Choe

Published

March 10, 2025

Modified

April 6, 2025

Consider the flights data.frame from the nycflights13 package.

library(tidyverse)
library(nycflights13)

flights <- flights

DT::datatable(flights |> head(100))

Question 1

Look at the number of cancelled flights per day. Is there a pattern? Is the proportion of cancelled flights related to the average delay?
- Assume that a flight is cancelled if either dep_delay, arr_delay, or both are missing.

Click to Check the Answer!

q1 <- flights |> 
  mutate(is_cancelled = is.na(dep_delay) | is.na(arr_delay),
         date = make_date(year,month,day)) |> 
  group_by(date, is_cancelled) |> 
  summarise(n = n(),
            dep_delay = mean(dep_delay, na.rm = T),
            arr_delay = mean(arr_delay, na.rm = T)) |> 
  mutate(prop = n / sum(n)) 
  # filter(is_cancelled == T) |> 
  # arrange(-prop)


q1_1 <- q1 |> 
   filter(is_cancelled == T) |>
   arrange(-prop)

Click to Check the Answer!

q1 |> 
  filter(is_cancelled == T) |> 
  ggplot(aes(x = date, y = prop)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90))

Click to Check the Answer!

q1 |> 
  ggplot(aes(x = dep_delay, fill = is_cancelled)) +
  geom_density(alpha = .4)

Click to Check the Answer!

q1 |> 
  ggplot(aes(x = arr_delay, fill = is_cancelled)) +
  geom_density(alpha = .4)

Click to Check the Answer!

q1_1 |> 
  filter(prop < .2) |> 
  ggplot(aes(x = prop, y = dep_delay)) +
  geom_point(alpha = .3) +
  geom_smooth() +
  geom_smooth(method = lm, color = 'darkorange')

Question 2

Which carrier has the worst arrival delays?
Calculate the proportion of flights with an arrival delay greater than 15 minutes for each carrier and for each origin.
Can you disentangle the effects of bad airports vs. bad carriers? Why/why not?

Click to Check the Answer!

q2_1 <- flights |> 
  group_by(carrier) |> 
  summarise(arr_delay_max = max(arr_delay, na.rm = T),
            n = n()) |> 
  arrange(-arr_delay_max)

q2_1

# A tibble: 16 × 3
   carrier arr_delay_max     n
   <chr>           <dbl> <int>
 1 HA               1272   342
 2 MQ               1127 26397
 3 AA               1007 32729
 4 DL                931 48110
 5 F9                834   685
 6 9E                744 18460
 7 VX                676  5162
 8 EV                577 54173
 9 FL                572  3260
10 B6                497 54635
11 US                492 20536
12 UA                455 58665
13 WN                453 12275
14 YV                381   601
15 AS                198   714
16 OO                157    32

Click to Check the Answer!

q2_2 <- flights |> 
  group_by(carrier, origin) |> 
  summarize(long_arr_delay = mean(arr_delay > 15, na.rm = T)) |> 
  filter(n() == 3)

Click to Check the Answer!

q2_2 |> 
  ggplot(aes(x = carrier, y = long_arr_delay, fill = carrier)) +
  geom_col(show.legend = F) +
  facet_wrap(~origin, ncol = 1) +
  scale_fill_viridis_d()

Click to Check the Answer!

q2_2 |> 
  ggplot(aes(y = origin, x = long_arr_delay, fill = origin)) +
  geom_col(show.legend = F) +
  facet_wrap(~carrier, nrow = 1) +
  scale_fill_viridis_d()

Click to Check the Answer!

m <- lm(arr_delay ~ origin + carrier, data = flights)
m_int <- lm(arr_delay ~ origin * carrier, data = flights)

summary(m)


Call:
lm(formula = arr_delay ~ origin + carrier, data = flights)

Residuals:
    Min      1Q  Median      3Q     Max 
 -89.38  -23.77  -11.34    7.35 1278.92 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   9.4813     0.4054  23.390  < 2e-16 ***
originJFK    -2.3170     0.2577  -8.991  < 2e-16 ***
originLGA    -1.9093     0.2371  -8.052 8.15e-16 ***
carrierAA    -7.2351     0.4256 -16.999  < 2e-16 ***
carrierAS   -19.4122     1.7121 -11.338  < 2e-16 ***
carrierB6     1.9717     0.3871   5.093 3.52e-07 ***
carrierDL    -5.9238     0.4011 -14.769  < 2e-16 ***
carrierEV     6.6826     0.4356  15.342  < 2e-16 ***
carrierF9    14.3488     1.7401   8.246  < 2e-16 ***
carrierFL    12.5440     0.8747  14.341  < 2e-16 ***
carrierHA   -14.0794     2.4190  -5.820 5.87e-09 ***
carrierMQ     3.1542     0.4528   6.966 3.26e-12 ***
carrierOO     3.9641     8.2334   0.481    0.630    
carrierUA    -5.4858     0.4262 -12.873  < 2e-16 ***
carrierUS    -5.7979     0.4801 -12.076  < 2e-16 ***
carrierVX    -6.1027     0.7068  -8.634  < 2e-16 ***
carrierWN     1.1171     0.5520   2.024    0.043 *  
carrierYV     7.9851     1.9374   4.122 3.76e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 44.29 on 327328 degrees of freedom
  (9430 observations deleted due to missingness)
Multiple R-squared:  0.0153,    Adjusted R-squared:  0.01525 
F-statistic: 299.2 on 17 and 327328 DF,  p-value: < 2.2e-16

Click to Check the Answer!

summary(m_int)


Call:
lm(formula = arr_delay ~ origin * carrier, data = flights)

Residuals:
    Min      1Q  Median      3Q     Max 
 -85.32  -23.89  -11.48    7.38 1278.92 

Coefficients: (13 not defined because of singularities)
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)           1.6153     1.2809   1.261 0.207315    
originJFK             7.2281     1.3354   5.413 6.21e-08 ***
originLGA             0.1533     1.5718   0.098 0.922310    
carrierAA            -0.6376     1.4909  -0.428 0.668927    
carrierAS           -11.5461     2.0980  -5.503 3.73e-08 ***
carrierB6             7.7733     1.3940   5.576 2.46e-08 ***
carrierDL             7.1652     1.4480   4.948 7.48e-07 ***
carrierEV            15.4074     1.2992  11.859  < 2e-16 ***
carrierF9            20.1522     1.9246  10.471  < 2e-16 ***
carrierFL            18.3474     1.2026  15.256  < 2e-16 ***
carrierHA           -15.7585     2.4220  -6.506 7.71e-11 ***
carrierMQ            14.6918     1.6045   9.157  < 2e-16 ***
carrierOO            19.8847    18.1078   1.098 0.272147    
carrierUA             1.8599     1.2976   1.433 0.151767    
carrierUS            -0.6381     1.4468  -0.441 0.659169    
carrierVX            -2.2924     1.7036  -1.346 0.178406    
carrierWN             9.4480     1.4014   6.742 1.57e-11 ***
carrierYV            13.7884     2.1043   6.552 5.67e-11 ***
originJFK:carrierAA  -6.1245     1.5841  -3.866 0.000111 ***
originLGA:carrierAA  -2.4627     1.7842  -1.380 0.167491    
originJFK:carrierAS       NA         NA      NA       NA    
originLGA:carrierAS       NA         NA      NA       NA    
originJFK:carrierB6  -7.7230     1.4604  -5.288 1.24e-07 ***
originLGA:carrierB6   3.9695     1.7619   2.253 0.024260 *  
originJFK:carrierDL -18.3878     1.5278 -12.035  < 2e-16 ***
originLGA:carrierDL  -5.0060     1.7356  -2.884 0.003923 ** 
originJFK:carrierEV  -6.4619     1.8184  -3.554 0.000380 ***
originLGA:carrierEV  -7.8960     1.6600  -4.757 1.97e-06 ***
originJFK:carrierF9       NA         NA      NA       NA    
originLGA:carrierF9       NA         NA      NA       NA    
originJFK:carrierFL       NA         NA      NA       NA    
originLGA:carrierFL       NA         NA      NA       NA    
originJFK:carrierHA       NA         NA      NA       NA    
originLGA:carrierHA       NA         NA      NA       NA    
originJFK:carrierMQ -11.0665     1.7329  -6.386 1.70e-10 ***
originLGA:carrierMQ  -7.1255     1.8777  -3.795 0.000148 ***
originJFK:carrierOO       NA         NA      NA       NA    
originLGA:carrierOO -12.2185    20.3428  -0.601 0.548087    
originJFK:carrierUA  -8.1928     1.5045  -5.446 5.17e-08 ***
originLGA:carrierUA   1.0137     1.6627   0.610 0.542067    
originJFK:carrierUS  -6.0912     1.7018  -3.579 0.000345 ***
originLGA:carrierUS   1.4004     1.7548   0.798 0.424835    
originJFK:carrierVX  -3.7232     1.8957  -1.964 0.049534 *  
originLGA:carrierVX       NA         NA      NA       NA    
originJFK:carrierWN       NA         NA      NA       NA    
originLGA:carrierWN  -2.9976     1.7666  -1.697 0.089727 .  
originJFK:carrierYV       NA         NA      NA       NA    
originLGA:carrierYV       NA         NA      NA       NA    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 44.24 on 327311 degrees of freedom
  (9430 observations deleted due to missingness)
Multiple R-squared:  0.01748,   Adjusted R-squared:  0.01738 
F-statistic: 171.3 on 34 and 327311 DF,  p-value: < 2.2e-16

Question 3

Find all destinations that are flown by at least two carriers. Use that information to rank the carriers.

Click to Check the Answer!

q3_1 <- flights |> 
  group_by(carrier, dest) |> 
  summarize(n_flights = n()) |> 
  summarise(n_dests = n()) |> 
  arrange(-n_dests)


q3_2 <- flights |> 
  distinct(carrier, dest) |> 
  count(carrier) |> 
  arrange(-n)

q3_3 <- flights |> 
  distinct(carrier, dest) |> 
  count(dest) |> 
  arrange(-n) |> 
  filter(n >= 2)

Question 4

The following is the data.frame for Question 4.

holiday_movies <- read_csv("https://bcdanl.github.io/data/holiday_movies.csv")

The data.frame holiday_movies comes from the Internet Movie Database (IMDb).
The following is the data.frame, holiday_movies.

Variable description

tconst: alphanumeric unique identifier of the title
title_type: the type/format of the title
- (movie, video, or tvMovie)
primary_title: the more popular title / the title used by the filmmakers on promotional materials at the point of release
simple_title: the title in lowercase, with punctuation removed, for easier filtering and grouping
year: the release year of a title
runtime_minutes: primary runtime of the title, in minutes
average_rating: weighted average of all the individual user ratings on IMDb
num_votes: number of votes the title has received on IMDb (titles with fewer than 10 votes were not included in this dataset)

The following is another data.frame holiday_movie_genres that is related with the data.frame holiday_movies:

holiday_movie_genres <- read_csv("https://bcdanl.github.io/data/holiday_movie_genres.csv")

The data.frame holiday_movie_genres include up to three genres associated with the titles that appear in the data.frame.

Variable description

tconst: alphanumeric unique identifier of the title
genres: genres associated with the title, one row per genre

Q4a.

Provide the R code to generate the data.frame, holiday_movie_with_genres, which combines the two data.frames, holiday_movies and holiday_movie_genres:

Click to Check the Answer!

holiday_movies_with_genres <- holiday_movie_genres |> 
  left_join(holiday_movies)

The following shows the first four variables in holiday_movie_with_genres:

Q4b.

Provide the R code using skimr::skim() to see how the summary statistics—mean, median, standard deviation, minimum, maximum, first and third quartiles—of average_rating and num_votes varies by popular genres and title_type.
- Consider only the five popular genres, which are selected in terms of the number of titles for each genre.
- Removes the video type of the titles when calculating the summary statistics.

Click to Check the Answer!

popular_genres <- holiday_movies_with_genres |> 
  group_by(genres) |> 
  count() |> 
  ungroup() |> 
  slice_max(n, n = 5)

holiday_movies_with_genres |> 
  filter(genres %in% popular_genres$genres,
         title_type != 'video') |> 
  group_by(genres, title_type) |> 
  skimr::skim(average_rating, num_votes)

Q4c.

Provide R code to recreate the ggplot figure illustrating how the relationship between log10(num_votes) and average_rating varies by the popular genres and title_type.
- The five popular genres are selected in terms of the number of titles for each genre.
- The video type of the titles are removed in the ggplot figure.

Click to Check the Answer!

popular_genres <- holiday_movies_with_genres |> 
  group_by(genres) |> 
  count() |> 
  ungroup() |> 
  slice_max(n, n = 5)

holiday_movies_with_genres |> 
  filter(genres %in% popular_genres$genres,
         title_type != 'video') |> 
  group_by(genres) |> 
  mutate(mean_rating = mean(average_rating, na.rm = T)) |> 
  ggplot(aes(y = average_rating, x = log10(num_votes))) +
  geom_point(alpha = .2) +
  geom_smooth(aes(color = genres),
              method = lm) +
  # coord_cartesian(ylim = c(5,7)) +
  facet_grid(title_type~ genres, scales = "free")

Q4d.

Provide a comment to illustrate how the relationship between log10(num_votes) and average_rating varies by the popular genres and title_type.

Q4e.

Provide R code to recreate the ggplot figure illustrating the annual trend of the share of number of movies by popular genre from year 1975 to 2022.
- For genres that are not popular, categorize them as “Other”.
  - Consider reordering the categories in genres in descending order based on their share in the year 2022.
  - Use “Set2” color palette from the RColorBrewer package.

Click to Check the Answer!

holiday_movies_with_genres |> 
  mutate(genres = ifelse( !(genres %in% popular_genres$genres), "Other", genres )) |> 
  group_by(year, genres) |> 
  count() |> 
  filter(year >= 1975, year <= 2022) |> 
  ggplot() +
  geom_col(aes(x = year, y = n, 
               fill = fct_reorder2(genres, year, n)), 
           position = 'fill', 
           width = rel(1.25),
           color = 'transparent') +
  labs(y = "Share of number of movies by genre",
       fill = "Genre", x = "",
       title = "How Have Christmas Movie Genres Evolved Over Time?") +
  scale_fill_brewer(palette = "Set2") +
  hrbrthemes::scale_y_percent() +
  guides(fill = guide_legend(title.position = "top",
                             # label.position = "bottom",
                             keywidth = 3,
                             keyheight = rel(3.4),
                             ncol = 1)) +
  theme(legend.position = "right",
        legend.title = 
          element_text(face = "bold.italic",
                       size = rel(1.25),
                       hjust = .5),
        legend.text = 
          element_text(face = "italic",
                       size = rel(1.25),
                       hjust = .5),
        legend.box.margin = margin(-20, 0, 0, -20),
        panel.grid.major.x = element_blank(),
        panel.grid.major.y = element_blank(),
        axis.text.y = element_text(margin = margin(0,-20,0,0)),
        axis.title.x = element_text(face = "bold",
                                    size = rel(1.5)),
        axis.title.y = element_text(face = "bold",
                                    size = rel(1.5)),
        plot.title = element_text(
          margin = margin(0,0,20,0),
          hjust = .33
        )
        )

c.f.) The following uses geom_area() instead:

Click to Check the Answer!

df <- holiday_movies_with_genres |> 
  mutate(genres = ifelse(!(genres %in% popular_genres$genres), "Other", genres)) |> 
  group_by(year, genres) |> 
  count() |> 
  filter(year >= 1975, year <= 2022) |> 
  group_by(year) |> 
  mutate(prop = n / sum(n)) |>  # Convert count to proportion
  ungroup()

# df2 <- df |> 
#   group_by(year) |> 
#   summarise(sum = sum(prop))

df |> 
  ggplot(aes(x = year, y = prop, fill = fct_reorder2(genres, year, prop))) +
  geom_area(position = "fill") +  # Stack by genre
  labs(y = "Share of number of movies by genre",
       fill = "Genre", x = "",
       title = "How Have Christmas Movie Genres Evolved Over Time?") +
  scale_fill_brewer(palette = "Set2") +
  hrbrthemes::scale_y_percent() +
  guides(fill = guide_legend(title.position = "top",
                             # label.position = "bottom",
                             keywidth = 3,
                             keyheight = rel(3.4),
                             ncol = 1)) +
  theme(legend.position = "right",
        legend.title = element_text(face = "bold.italic",
                                    size = rel(1.25),
                                    hjust = .5),
        legend.text = element_text(face = "italic",
                                   size = rel(1.25),
                                   hjust = .5),
        legend.box.margin = margin(-20, 0, 0, -20),
        panel.grid.major.x = element_blank(),
        panel.grid.major.y = element_blank(),
        axis.text.y = element_text(margin = margin(0, -20, 0, 0)),
        axis.title.x = element_text(face = "bold",
                                    size = rel(1.5)),
        axis.title.y = element_text(face = "bold",
                                    size = rel(1.5)),
        plot.title = element_text(
          margin = margin(0,0,20,0),
          hjust = .33
        )
  )

Q4f.

Provide a comment to illustrate the annual trend of (1) the share of number of movies by popular genre from year 1975 to 2022.
- Which genre has become more popular since 2010?

Q4g.

Add the following two variables—christmas and holiday—to the data.frame holiday_movies_with_genres:
christmas:
- TRUE if the simple_title includes “christmas”, “xmas”, “x mas”
- FALSE otherwise
holiday:
- TRUE if the simple_title includes “holiday”
- FALSE otherwise

Click to Check the Answer!

holiday_movies_with_genres <- holiday_movies_with_genres |> 
  mutate(christmas = ifelse(str_detect(simple_title, "christmas") |
                              str_detect(simple_title, "xmas") |
                              str_detect(simple_title, "x mas"),
                            T, F),
         holiday = ifelse(str_detect(simple_title, "holiday"),
                            T, F),
         )

Q4h.

Provide R code to recreate the ggplot figure illustrating the annual trend of (1) the number of movie titles with “holiday” varies by christmas.

Click to Check the Answer!

holiday_movies_with_genres |> 
  group_by(year, christmas, holiday) |> 
  count() |> 
  ggplot(aes(x = year, y = n, color = holiday)) +
  geom_smooth() +
  geom_point(alpha = .33) +
  facet_wrap(christmas~., scales = "free")

Q4i.

Provide R code to recreate the ggplot figure illustrating how the mean value of num_votes varies by the popular genres for the titles with “christmas”.

Click to Check the Answer!

holiday_movies_with_genres |> 
  filter(genres %in% popular_genres$genres) |> 
  group_by(genres, christmas) |> 
  summarise(mean_rating = mean(average_rating),
            mean_votes = mean(num_votes)) |> 
  filter(christmas == T) |> 
  ggplot(aes(x = mean_votes, y = fct_reorder(genres, mean_votes),
             )) +
  geom_point(size = 2) +
  labs(y = "genres")

Question 5

The following is the data.frame for Question 5.

tripadvisor <- read_csv("https://bcdanl.github.io/data/tripadvisor_cleaned.csv")

TripAdvisor is an online travel research company that empowers people around the world to plan and enjoy the ideal trip.
TripAdvisor wanted to know whether promoting membership on their platform could drive engagement and bookings.
To do so, TripAdvisor had just run an experiment to explore user retention by offering a random subset of customers an easier sign-up process for membership.
The following is the data.frame, tripadvisor.

Variable description

id: a unique identifier for a user.
time:
- PRE if time is before the experiment;
- POST if time is in the 28 days after the experiment.
- For each id value, there are two observations—one with time == “PRE” and the other with time == “POST”.
days_visited: Number of days a user visited the TripAdvisor website.
easier_signup:
- TRUE if a user was exposed to the easier signup process (e.g., one-click signup) during the experiment;
- FALSE otherwise.
became_member:
- TRUE if a user became a member during the experiment period;
- FALSE otherwise.
locale_en_US:
- TRUE if a user accessed the website from the US;
- FALSE otherwise.
os_type: Windows, Mac, or Others
revenue_pre: Amount of dollars a user spent on the website before the experiment

Q5a.

Using the given data.frame, tripadvisor, create the data.frame, tripadvisor, for which
- time is a factor-type variable of time with the first level, “PRE”.

Click to Check the Answer!

tripadvisor <- tripadvisor |> 
  mutate(time = factor(time, levels = c("PRE", "POST")),
         os_type = factor(os_type, levels = c("Windows", "Mac", "Others") ) )

Q5b.

Provide R code to recreate the ggplot figure illustrating how the relationship between time and days_visited varies by easier_signup and became_member.
- Row-wise split is based on easier_signup.
- Column-wise split is based on became_member.

Click to Check the Answer!

ggplot(tripadvisor,
       aes(y = days_visited, 
                   x = time)) +
  geom_boxplot(aes(fill = time) ) +
  facet_grid(easier_signup ~ became_member)

Q5c.

Provide a comment to illustrate how the relationship between time and days_visited varies by easier_signup and became_member.

Q5d.

Provide a R code to create the data.frame Q5d that includes the variable diff, the difference between (1) the value of days_visited for time == PRE and (2) the value of days_visited for time == POST for each id.

Click to Check the Answer!

q2d <- tripadvisor_wide <- tripadvisor |> 
  pivot_wider(names_from = time,
              values_from = days_visited) |> 
  relocate(PRE, POST, .after = id) |> 
  mutate(diff = POST - PRE, .before = PRE) |> 
  select(id:became_member)

The resulting data.frame should look as follows:

Include only the variables as shown above.
Adjust the order of the variables as shown above.

Q5e.

Provide an R code to calculate how the difference in the mean value of diff varies by easier_signup and became_member using the data.frame Q5d.

Click to Check the Answer!

q2e <- q2d |> 
  group_by(easier_signup, became_member) |> 
  summarise(mean_diff = round(mean(diff),2))

The resulting data.frame should look as follows:

Q5f.

Using the resulting data.frame in Q5e, discuss the following question:
- What is the effect of easier signup process on the number of days a user visited the TripAdvisor website?
- How does this effect varies by the status of became_member?

Question 6

In September 2019, YouGov survey asked 1,639 GB adults the following question:

In hindsight, do you think Britain was right/wrong to vote to leave EU?

Right to leave

Wrong to leave

Don’t know

The data from the survey is in brexit.csv.

brexit <- read_csv('https://bcdanl.github.io/data/brexit.csv')

datatable(brexit)

Q6a

Replicate the following visualization

Click to Check the Answer!

brexit <- brexit |> 
  mutate(
    region = fct_relevel(region, 
                         "london", "rest_of_south", "midlands_wales", "north", "scot"),
    region = fct_recode(region, 
                        London = "london", 
                        `Rest of South` = "rest_of_south", 
                        `Midlands / Wales` = "midlands_wales", 
                        North = "north", 
                        Scotland = "scot")
  )

ggplot(brexit, 
       aes(y = opinion, fill = opinion)) +
  geom_bar() +
  facet_wrap( ~ region, 
             nrow = 1, 
             labeller = label_wrap_gen(width = 12)) +
  guides(fill = "none") +
  labs(
    title = "Was Britain right/wrong to vote to leave EU?",
    subtitle = "YouGov Survey Results, 2-3 September 2019",
    caption = "Source: bit.ly/2lCJZVg",
    x = NULL, y = NULL
  ) +
  scale_fill_manual(values = c(
    "gray",
    "#67a9cf",
    "#ef8a62"
  )) +
  theme_minimal()

Q6b

Replicate the following visualization
- How is the story this visualization telling different than the story the plot in Q6a?

Click to Check the Answer!

ggplot(brexit, 
       aes(y = opinion, fill = opinion)) +
  geom_bar() +
  facet_wrap(~region, scales = 'free_x',
    nrow = 1, labeller = label_wrap_gen(width = 12),
    # ___
  ) +
  guides(fill = "none") +
  labs(
    title = "Was Britain right/wrong to vote to leave EU?",
    subtitle = "YouGov Survey Results, 2-3 September 2019",
    caption = "Source: bit.ly/2lCJZVg",
    x = NULL, y = NULL
  ) +
  scale_fill_manual(values = c(
    "Wrong" = "#ef8a62",
    "Right" = "#67a9cf",
    "Don't know" = "gray"
  )) +
  theme_minimal()

Q6c

First, calculate the proportion of wrong, right, and don’t know answers in each region and then plot these proportions (rather than the counts) and then improve axis labeling.

Click to Check the Answer!

q6 <- brexit |> 
  group_by(region, opinion) |>  
  summarise(n = n()) |> 
  mutate(tot = sum(n),
         prop = n / tot )

Replicate the following visualization
- How is the story this visualization telling different than the story the plot in Q4b?

Click to Check the Answer!

ggplot(q6, aes(y = opinion, x = prop,
               fill = opinion)) +
  geom_col() +
  facet_wrap(~region,
    nrow = 1, labeller = label_wrap_gen(width = 12),
    # ___
  ) +
  guides(fill = "none") +
  labs(
    title = "Was Britain right/wrong to vote to leave EU?",
    subtitle = "YouGov Survey Results, 2-3 September 2019",
    caption = "Source: bit.ly/2lCJZVg",
    x = 'Percent', y = NULL
  ) +
  scale_fill_manual(values = c(
    "Wrong" = "#ef8a62",
    "Right" = "#67a9cf",
    "Don't know" = "gray"
  )) +
  scale_x_continuous(labels = scales::percent) +
  theme_minimal()

Q6d.

Recreate the same visualization from the previous exercise, this time dodging the bars for opinion proportions for each region, rather than faceting by region and then improve the legend.
- How is the story this visualization telling different than the story the previous plot tells?

Click to Check the Answer!

ggplot(q6, aes(y = region, x = prop,
               fill = opinion)) +
  geom_col(position = "dodge") +
  labs(
    title = "Was Britain right/wrong to vote to leave EU?",
    subtitle = "YouGov Survey Results, 2-3 September 2019",
    caption = "Source: bit.ly/2lCJZVg",
    x = 'Percent', y = NULL, fill = 'Opinion'
  ) +
  scale_fill_manual(values = c(
    "Wrong" = "#ef8a62",
    "Right" = "#67a9cf",
    "Don't know" = "gray"
  )) +
  scale_x_continuous(labels = scales::percent) +
  theme_minimal()

Discussion

Welcome to our Classwork 7 Discussion Board! 👋

This space is designed for you to engage with your classmates about the material covered in Classwork 7.

Whether you are looking to delve deeper into the content, share insights, or have questions about the content, this is the perfect place for you.

If you have any specific questions for Byeong-Hak (@bcdanl) regarding the Classwork 7 materials or need clarification on any points, don’t hesitate to ask here.

All comments will be stored here.

Let’s collaborate and learn from each other!