class: center, middle, inverse, title-slide .title[ # Fundamentals of R: Data Visualization ] .subtitle[ ##
STA35A: Statistical Data Science 1 ] .author[ ### Xiao Hui Tai ] .date[ ### October 13, 2023 ] --- layout: true <!-- <div class="my-footer"> --> <!-- <span> --> <!-- <a href="https://datasciencebox.org" target="_blank">datasciencebox.org</a> --> <!-- </span> --> <!-- </div> --> --- <style type="text/css"> .tiny .remark-code { font-size: 60%; } .small .remark-code { font-size: 80%; } </style> ## Recap -- - Data visualization using `ggplot2` - `ggplot2` vs. base R - `ggplot()` is the main function in the `ggplot2` package - Plots are constructed in layers - Basic structure of `ggplot()` code ```r ggplot(data = [dataset], mapping = aes(x = [x-variable], y = [y-variable])) + geom_xxx() + other options ``` - Data, mapping, geom, labels --- ## Today - More on `ggplot()` - Data, mapping, geom, labels - Mapping vs. setting - Faceting --- ## Back to Palmer Penguins example Data contains information on 344 penguins, including: penguin species, island in Palmer Archipelago, size (flipper length, body mass, bill dimensions), and sex. 
<img src="img/penguins.png" width="40%" style="display: block; margin: auto;" /> --- ```r library(palmerpenguins) dplyr::glimpse(penguins) ``` ``` ## Rows: 344 ## Columns: 8 ## $ species <fct> Adelie, Adelie, Adelie, Adelie, Adeli… ## $ island <fct> Torgersen, Torgersen, Torgersen, Torg… ## $ bill_length_mm <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.… ## $ bill_depth_mm <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.… ## $ flipper_length_mm <int> 181, 186, 195, NA, 193, 190, 181, 195… ## $ body_mass_g <int> 3750, 3800, 3250, NA, 3450, 3650, 362… ## $ sex <fct> male, female, female, NA, female, mal… ## $ year <int> 2007, 2007, 2007, 2007, 2007, 2007, 2… ``` --- .panelset[ .panel[.panel-name[Plot] <img src="lecture8_files/figure-html/unnamed-chunk-6-1.png" width="70%" style="display: block; margin: auto;" /> ] .panel[.panel-name[Code] .small[ ```r ggplot(data = penguins, mapping = aes(x = bill_depth_mm, y = bill_length_mm, color = species)) + geom_point() + labs(title = "Bill depth and length", subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins", x = "Bill depth (mm)", y = "Bill length (mm)", color = "Species", caption = "Source: Palmer Station LTER / palmerpenguins package") + scale_color_viridis_d() ``` ``` ## Warning: Removed 2 rows containing missing values (geom_point). 
``` ] ] ] --- .midi[ > **Start with the `penguins` data frame** ] .tiny[ .pull-left[ ```r ggplot(data = penguins) #<< ``` ] ] .pull-right[ <img src="lecture8_files/figure-html/unnamed-chunk-7-1.png" width="100%" style="display: block; margin: auto;" /> ] --- .midi[ > Start with the `penguins` data frame, > **map bill depth to the x-axis** ] .tiny[ .pull-left[ ```r ggplot(data = penguins, mapping = aes(x = bill_depth_mm)) #<< ``` ] ] .pull-right[ <img src="lecture8_files/figure-html/unnamed-chunk-8-1.png" width="100%" style="display: block; margin: auto;" /> ] --- .midi[ > Start with the `penguins` data frame, > map bill depth to the x-axis > **and map bill length to the y-axis.** ] .tiny[ .pull-left[ ```r ggplot(data = penguins, mapping = aes(x = bill_depth_mm, y = bill_length_mm)) #<< ``` ] ] .pull-right[ <img src="lecture8_files/figure-html/unnamed-chunk-9-1.png" width="100%" style="display: block; margin: auto;" /> ] --- .midi[ > Start with the `penguins` data frame, > map bill depth to the x-axis > and map bill length to the y-axis. > **Represent each observation with a point** ] .tiny[ .pull-left[ ```r ggplot(data = penguins, mapping = aes(x = bill_depth_mm, y = bill_length_mm)) + geom_point() #<< ``` ] ] .pull-right[ <img src="lecture8_files/figure-html/unnamed-chunk-10-1.png" width="100%" style="display: block; margin: auto;" /> ] --- .midi[ > Start with the `penguins` data frame, > map bill depth to the x-axis > and map bill length to the y-axis. > Represent each observation with a point > **and map species to the color of each point.** ] .tiny[ .pull-left[ ```r ggplot(data = penguins, mapping = aes(x = bill_depth_mm, y = bill_length_mm, color = species)) + #<< geom_point() ``` ] ] .pull-right[ <img src="lecture8_files/figure-html/unnamed-chunk-11-1.png" width="100%" style="display: block; margin: auto;" /> ] --- .midi[ > Start with the `penguins` data frame, > map bill depth to the x-axis > and map bill length to the y-axis. 
> Represent each observation with a point > and map species to the color of each point. > **Title the plot "Bill depth and length"** ] .tiny[ .pull-left[ ```r ggplot(data = penguins, mapping = aes(x = bill_depth_mm, y = bill_length_mm, color = species)) + geom_point() + labs(title = "Bill depth and length") #<< ``` ] ] .pull-right[ <img src="lecture8_files/figure-html/unnamed-chunk-12-1.png" width="100%" style="display: block; margin: auto;" /> ] --- .midi[ > Start with the `penguins` data frame, > map bill depth to the x-axis > and map bill length to the y-axis. > Represent each observation with a point > and map species to the color of each point. > Title the plot "Bill depth and length", > **add the subtitle "Dimensions for Adelie, Chinstrap, and Gentoo Penguins"** ] .tiny[ .pull-left[ ```r ggplot(data = penguins, mapping = aes(x = bill_depth_mm, y = bill_length_mm, color = species)) + geom_point() + labs(title = "Bill depth and length", subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins") #<< ``` ] ] .pull-right[ <img src="lecture8_files/figure-html/unnamed-chunk-13-1.png" width="100%" style="display: block; margin: auto;" /> ] --- .midi[ > Start with the `penguins` data frame, > map bill depth to the x-axis > and map bill length to the y-axis. > Represent each observation with a point > and map species to the color of each point. 
> Title the plot "Bill depth and length", > add the subtitle "Dimensions for Adelie, Chinstrap, and Gentoo Penguins", > **label the x and y axes as "Bill depth (mm)" and "Bill length (mm)", respectively** ] .tiny[ .pull-left[ ```r ggplot(data = penguins, mapping = aes(x = bill_depth_mm, y = bill_length_mm, color = species)) + geom_point() + labs(title = "Bill depth and length", subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins", x = "Bill depth (mm)", y = "Bill length (mm)") #<< ``` ] ] .pull-right[ <img src="lecture8_files/figure-html/unnamed-chunk-14-1.png" width="100%" style="display: block; margin: auto;" /> ] --- .midi[ > Start with the `penguins` data frame, > map bill depth to the x-axis > and map bill length to the y-axis. > Represent each observation with a point > and map species to the color of each point. > Title the plot "Bill depth and length", > add the subtitle "Dimensions for Adelie, Chinstrap, and Gentoo Penguins", > label the x and y axes as "Bill depth (mm)" and "Bill length (mm)", respectively, > **label the legend "Species"** ] .tiny[ .pull-left[ ```r ggplot(data = penguins, mapping = aes(x = bill_depth_mm, y = bill_length_mm, color = species)) + geom_point() + labs(title = "Bill depth and length", subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins", x = "Bill depth (mm)", y = "Bill length (mm)", color = "Species") #<< ``` ] ] .pull-right[ <img src="lecture8_files/figure-html/unnamed-chunk-15-1.png" width="100%" style="display: block; margin: auto;" /> ] --- .midi[ > Start with the `penguins` data frame, > map bill depth to the x-axis > and map bill length to the y-axis. > Represent each observation with a point > and map species to the color of each point. 
> Title the plot "Bill depth and length", > add the subtitle "Dimensions for Adelie, Chinstrap, and Gentoo Penguins", > label the x and y axes as "Bill depth (mm)" and "Bill length (mm)", respectively, > label the legend "Species", > **and add a caption for the data source.** ] .tiny[ .pull-left[ ```r ggplot(data = penguins, mapping = aes(x = bill_depth_mm, y = bill_length_mm, color = species)) + geom_point() + labs(title = "Bill depth and length", subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins", x = "Bill depth (mm)", y = "Bill length (mm)", color = "Species", caption = "Source: Palmer Station LTER / palmerpenguins package") #<< ``` ] ] .pull-right[ <img src="lecture8_files/figure-html/unnamed-chunk-16-1.png" width="100%" style="display: block; margin: auto;" /> ] --- .midi[ > Start with the `penguins` data frame, > map bill depth to the x-axis > and map bill length to the y-axis. > Represent each observation with a point > and map species to the color of each point. > Title the plot "Bill depth and length", > add the subtitle "Dimensions for Adelie, Chinstrap, and Gentoo Penguins", > label the x and y axes as "Bill depth (mm)" and "Bill length (mm)", respectively, > label the legend "Species", > and add a caption for the data source. 
> **Finally, use a discrete color scale that is designed to be perceived by viewers with common forms of color blindness.** ] .tiny[ .pull-left[ ```r ggplot(data = penguins, mapping = aes(x = bill_depth_mm, y = bill_length_mm, color = species)) + geom_point() + labs(title = "Bill depth and length", subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins", x = "Bill depth (mm)", y = "Bill length (mm)", color = "Species", caption = "Source: Palmer Station LTER / palmerpenguins package") + scale_color_viridis_d() #<< ``` ] ] .pull-right[ <img src="lecture8_files/figure-html/unnamed-chunk-17-1.png" width="100%" style="display: block; margin: auto;" /> ] --- .panelset[ .panel[.panel-name[Plot] <img src="lecture8_files/figure-html/unnamed-chunk-18-1.png" width="70%" style="display: block; margin: auto;" /> ] .panel[.panel-name[Code] .small[ ```r ggplot(data = penguins, mapping = aes(x = bill_depth_mm, y = bill_length_mm, color = species)) + geom_point() + labs(title = "Bill depth and length", subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins", x = "Bill depth (mm)", y = "Bill length (mm)", color = "Species", caption = "Source: Palmer Station LTER / palmerpenguins package") + scale_color_viridis_d() ``` ``` ## Warning: Removed 2 rows containing missing values (geom_point). ``` ] ] .panel[.panel-name[Narrative] .pull-left-wide[ .midi[ Start with the `penguins` data frame, map bill depth to the x-axis and map bill length to the y-axis. Represent each observation with a point and map species to the color of each point. Title the plot "Bill depth and length", add the subtitle "Dimensions for Adelie, Chinstrap, and Gentoo Penguins", label the x and y axes as "Bill depth (mm)" and "Bill length (mm)", respectively, label the legend "Species", and add a caption for the data source. Finally, use a discrete color scale that is designed to be perceived by viewers with common forms of color blindness. 
] ] ] ] --- ## Argument names We can omit the names of the first two arguments when building plots with `ggplot()`. .tiny[ .pull-left[ ```r ggplot(data = penguins, mapping = aes(x = bill_depth_mm, y = bill_length_mm, color = species)) + geom_point() + scale_color_viridis_d() ``` ] ] .tiny[ .pull-right[ ```r ggplot(penguins, aes(x = bill_depth_mm, y = bill_length_mm, color = species)) + geom_point() + scale_color_viridis_d() ``` ] ] <br><br> <br><br><br><br>You will also often see code that omits the data argument using a pipe, and adds lines for data manipulation: ```r penguins %>% # other data manipulation functions ggplot(aes(x = bill_depth_mm, y = bill_length_mm, color = species)) + geom_point() + scale_color_viridis_d() ``` --- ## Aesthetics options - Variables in the data are **mapped** to visual properties (aesthetics) of geoms. - Commonly used aesthetics are - `color` - `shape` - `size` - `alpha` (transparency) --- ## Color .tiny[ .pull-left[ ```r ggplot(penguins, aes(x = bill_depth_mm, y = bill_length_mm, color = species)) + #<< geom_point() + scale_color_viridis_d() ``` ] ] .pull-right[ <img src="lecture8_files/figure-html/unnamed-chunk-20-1.png" width="100%" style="display: block; margin: auto;" /> ] --- ## Shape Different variables mapped to `color` and `shape` .tiny[ .pull-left[ ```r ggplot(penguins, aes(x = bill_depth_mm, y = bill_length_mm, color = species, shape = island)) + #<< geom_point() + scale_color_viridis_d() ``` ] ] .pull-right[ <img src="lecture8_files/figure-html/unnamed-chunk-21-1.png" width="100%" style="display: block; margin: auto;" /> ] --- ## Shape Same variable mapped to `color` and `shape` .tiny[ .pull-left[ ```r ggplot(penguins, aes(x = bill_depth_mm, y = bill_length_mm, color = species, shape = species)) + #<< geom_point() + scale_color_viridis_d() ``` ] ] .pull-right[ <img src="lecture8_files/figure-html/unnamed-chunk-22-1.png" width="100%" style="display: block; margin: auto;" /> ] --- ## Size .tiny[ .pull-left[ ```r 
ggplot(penguins, aes(x = bill_depth_mm, y = bill_length_mm, color = species, shape = species, size = body_mass_g)) + #<< geom_point() + scale_color_viridis_d() ``` ] ] .pull-right[ <img src="lecture8_files/figure-html/unnamed-chunk-23-1.png" width="100%" style="display: block; margin: auto;" /> ] --- ## Alpha .tiny[ .pull-left[ ```r ggplot(penguins, aes(x = bill_depth_mm, y = bill_length_mm, color = species, shape = species, size = body_mass_g, alpha = flipper_length_mm)) + #<< geom_point() + scale_color_viridis_d() ``` ] ] .pull-right[ <img src="lecture8_files/figure-html/unnamed-chunk-24-1.png" width="100%" style="display: block; margin: auto;" /> ] --- ## Mapping vs. setting .pull-left[ **Mapping** .tiny[ ```r ggplot(penguins, aes(x = bill_depth_mm, y = bill_length_mm, size = body_mass_g, #<< alpha = flipper_length_mm)) + #<< geom_point() ``` <img src="lecture8_files/figure-html/unnamed-chunk-25-1.png" width="100%" style="display: block; margin: auto;" /> ] ] .pull-right[ **Setting** .tiny[ ```r ggplot(penguins, aes(x = bill_depth_mm, y = bill_length_mm)) + geom_point(size = 2, alpha = 0.5) #<< ``` <img src="lecture8_files/figure-html/unnamed-chunk-26-1.png" width="100%" style="display: block; margin: auto;" /> ] ] --- ## Mapping vs. setting - **Mapping:** Determine the size, alpha, etc. of points based on the values of a variable in the data - Goes into `aes()` - **Setting:** Determine the size, alpha, etc. 
of points **not** based on the values of a variable in the data - Goes into `geom_*()` <small>this was `geom_point()` in the previous example, but we'll learn about other geoms soon</small> --- ## Faceting - Smaller plots that display different subsets of the data - Useful for exploring conditional relationships and large datasets .panelset[ .panel[.panel-name[Plot] <img src="lecture8_files/figure-html/unnamed-chunk-27-1.png" width="70%" style="display: block; margin: auto;" /> ] .panel[.panel-name[Code] ```r ggplot(penguins, aes(x = bill_depth_mm, y = bill_length_mm)) + geom_point() + facet_grid(species ~ island) #<< ``` ``` ## Warning: Removed 2 rows containing missing values (geom_point). ``` ] ] --- ## `facet_wrap()` with one variable - `facet_wrap()`: creates a 1D ribbon of panels, wrapped into the number of rows and columns specified (or, if none are specified, to fit the available plotting area) ```r ggplot(penguins, aes(x = bill_depth_mm, y = bill_length_mm)) + geom_point() + facet_wrap(~ species) #<< ``` <img src="lecture8_files/figure-html/unnamed-chunk-28-1.png" width="60%" style="display: block; margin: auto;" /> --- ## `facet_wrap()` with one variable - Specifying 2 columns ```r ggplot(penguins, aes(x = bill_depth_mm, y = bill_length_mm)) + geom_point() + facet_wrap(~ species, ncol = 2) #<< ``` <img src="lecture8_files/figure-html/unnamed-chunk-29-1.png" width="60%" style="display: block; margin: auto;" /> --- ## `facet_wrap()` with two variables ```r ggplot(penguins, aes(x = bill_depth_mm, y = bill_length_mm)) + geom_point() + facet_wrap(~ species + sex) #<< ``` <img src="lecture8_files/figure-html/unnamed-chunk-30-1.png" width="60%" style="display: block; margin: auto;" /> --- ## 2D grid using `facet_grid()` - `facet_grid()`: - 2D grid - `rows ~ cols` - use `.` for no split (1D) - Uses all levels, even if there are no observations; i.e., may produce empty plots - `facet_wrap()` only displays the plots having actual values --- ## 2D grid using `facet_grid()` ```r ggplot(penguins, aes(x = 
bill_depth_mm, y = bill_length_mm)) + geom_point() + facet_grid(species ~ sex) #<< ``` <img src="lecture8_files/figure-html/unnamed-chunk-31-1.png" width="60%" style="display: block; margin: auto;" /> --- ## 2D grid using `facet_grid()` ```r ggplot(penguins, aes(x = bill_depth_mm, y = bill_length_mm)) + geom_point() + facet_grid(sex ~ species) #<< ``` <img src="lecture8_files/figure-html/unnamed-chunk-32-1.png" width="60%" style="display: block; margin: auto;" /> --- ## Going back to the UN Votes example: <img src="lecture8_files/figure-html/unnamed-chunk-33-1.png" width="100%" style="display: block; margin: auto;" /> --- .small[ ```r un_votes %>% filter(country %in% c("United States", "United Kingdom", "China", "Singapore")) %>% inner_join(un_roll_calls, by = "rcid") %>% inner_join(un_roll_call_issues, by = "rcid") %>% mutate(year = lubridate::year(date)) %>% group_by(country, year, issue) %>% summarize(votes = n(), percent_yes = mean(vote == "yes")) %>% filter(votes > 5) %>% # Only use records where there are more than 5 votes ggplot(mapping = aes(x = year, y = percent_yes, color = country)) + geom_point(alpha = 0.4) + geom_smooth(method = "loess", se = FALSE) + facet_wrap(~ issue) + scale_y_continuous(labels = scales::percent) + labs( title = "Percentage of 'Yes' votes in the UN General Assembly", subtitle = "1946 to 2019", y = "% Yes", x = "Year", color = "Country" ) + scale_color_viridis_d() + theme(text = element_text(size = 9)) ``` ] How can we reduce the size of all points? What if we just want to group by country? 
--- <img src="lecture8_files/figure-html/unnamed-chunk-35-1.png" width="100%" style="display: block; margin: auto;" /> --- .small[ ```r un_votes %>% filter(country %in% c("United States", "United Kingdom", "China", "Singapore")) %>% inner_join(un_roll_calls, by = "rcid") %>% inner_join(un_roll_call_issues, by = "rcid") %>% mutate(year = lubridate::year(date)) %>% group_by(country, year, issue) %>% summarize(votes = n(), percent_yes = mean(vote == "yes")) %>% filter(votes > 5) %>% # Only use records where there are more than 5 votes ggplot(mapping = aes(x = year, y = percent_yes, color = country)) + geom_point(alpha = 0.4, size = 0.5) + geom_smooth(method = "loess", se = FALSE) + facet_wrap(~ issue) + scale_y_continuous(labels = scales::percent) + labs( title = "Percentage of 'Yes' votes in the UN General Assembly", subtitle = "1946 to 2019", y = "% Yes", x = "Year", color = "Country" ) + scale_color_viridis_d() + theme(text = element_text(size = 9)) ``` ] --- <img src="lecture8_files/figure-html/unnamed-chunk-37-1.png" width="100%" style="display: block; margin: auto;" /> --- .small[ ```r un_votes %>% filter(country %in% c("United States", "United Kingdom", "China", "Singapore")) %>% inner_join(un_roll_calls, by = "rcid") %>% inner_join(un_roll_call_issues, by = "rcid") %>% mutate(year = lubridate::year(date)) %>% group_by(country, year) %>% summarize(votes = n(), percent_yes = mean(vote == "yes")) %>% filter(votes > 5) %>% # Only use records where there are more than 5 votes ggplot(mapping = aes(x = year, y = percent_yes, color = country)) + geom_point(alpha = 0.4) + geom_smooth(method = "loess", se = FALSE) + # facet_wrap(~ issue) + scale_y_continuous(labels = scales::percent) + labs( title = "Percentage of 'Yes' votes in the UN General Assembly", subtitle = "1946 to 2019", y = "% Yes", x = "Year", color = "Country" ) + scale_color_viridis_d() + theme(text = element_text(size = 9)) ``` ] --- ## Summary -- - More on `ggplot()` - Data, mapping, geom, labels - 
Mapping vs. setting - Faceting