Visualization for data exploration & communication - 2

Bonwoo Koo & Subhrajit Guhathakurta

2022-09-27

library(tidyverse)
library(sf)
library(tmap)
library(leaflet)
library(here)
library(tidycensus)
# Let's prepare data
yelp <- read_rds("https://github.com/BonwooKoo/UrbanAnalytics2022/blob/main/Lab/module_1/week4/yelp_in.rds?raw=true")
# Census data
census_api_key(Sys.getenv("census_api"))

census_var <- c(hhincome = 'B19019_001',
                race.tot = "B02001_001",
                race.white = "B02001_002", 
                race.black = 'B02001_003'
                )

census <- get_acs(geography = "tract", state = "GA", county = c("Fulton", "DeKalb"),
                 output = "wide", geometry = TRUE, year = 2020,
                 variables = census_var)
  
summarise_mean <- c(str_c(names(census_var), "E"), 
                    "rating", "review_count")

census_yelp <- census %>% 
  separate(col = NAME, into=c("tract","county","state"), sep=', ') %>% 
  # Spatial join
  st_join(yelp %>% 
            mutate(n = 1,
                   price = nchar(price)) %>% 
            st_transform(crs = st_crs(census))) %>% 
  # Group_by
  group_by(GEOID, county) %>% 
  # Mean for all census variables, sum for n
  summarise(across(
    all_of(summarise_mean), mean), 
    n = sum(n),
    price = median(price)) %>% 
  # Release grouping
  ungroup() %>% 
  # Drop 'E' from column names
  rename_with(function(x) str_sub(x,1,nchar(x)-1), str_c(names(census_var), "E")) %>% # rename_with() renames with a function
  # Replace NA in column n&review_count with 0
  mutate(across(c(n, review_count), function(x) case_when(is.na(x) ~ 0, TRUE ~ x)))

Other plots

Bar chart is also very frequently used. Note that ggplot creates y-axis automatically by examining how many rows there are for each category of x. You can try yelp %>% group_by(price) %>% tally() to check the exact Y value for this plot.

ggplot(data = yelp) +
  geom_bar(mapping = aes(x=price)) 

yelp %>% 
  group_by(price) %>% 
  tally()
## Simple feature collection with 4 features and 2 fields
## Geometry type: MULTIPOINT
## Dimension:     XY
## Bounding box:  xmin: -84.74486 ymin: 33.51273 xmax: -84.07477 ymax: 34.16806
## Geodetic CRS:  WGS 84
## # A tibble: 4 × 3
##   price     n                                                           geometry
##   <chr> <int>                                                   <MULTIPOINT [°]>
## 1 $      1653 ((-84.66889 33.51819), (-84.66473 33.52469), (-84.65964 33.52801)…
## 2 $$     1850 ((-84.73536 33.51273), (-84.73738 33.52528), (-84.73587 33.52488)…
## 3 $$$     141 ((-84.74486 33.52821), (-84.54959 33.66628), (-84.44467 33.6685),…
## 4 $$$$     23 ((-84.36382 33.75925), (-84.38283 33.78418), (-84.38258 33.77639)…

We can also further break each price level by another categorical variable. We use rating to see the relative frequency of each rating for each price level. This is done by adding fill=rating in the mapping.

ggplot(data = yelp %>% 
         st_set_geometry(NULL) %>% 
         mutate(rating = round(rating,0) %>% #<<
                  factor(ordered = TRUE))) + #<<
         # delete %>% factor(ordered = T) and see what happens
  geom_bar(mapping = aes(x=price, fill=rating), position = "stack")

By changing position="stack" = position="fill", we convert the Y-axis to the proportion within each level of price and fill it up to the top. This shows more clearly how different rating levels are distributed within each price level.

ggplot(data = yelp %>% 
         st_set_geometry(NULL) %>% 
         mutate(rating = round(rating,0) %>% 
                  factor(ordered = TRUE))) + #<<
  geom_bar(mapping = aes(x=price, fill=rating), position = "fill") #<<

We want rating=5 to be on the top because (I think) it is more intuitive to see higher value on top. We can flip the bar chart vertically by adjusting the levels when we declare rating variable into a factor.

ggplot(data = yelp %>% 
         st_set_geometry(NULL) %>% 
         mutate(rating = round(rating,0) %>% 
                  factor(levels = c(5,4,3,2,1), #<<
                         ordered = TRUE))) +
  geom_bar(mapping = aes(x=price, fill=rating), position = "fill")

Sometimes we want to see the exact figures on top of the bar chart. So, let’s add the percentage within each level of price as labels.

ggplot(data = yelp %>% 
         st_set_geometry(NULL) %>% 
         mutate(rating = round(rating,0) %>% factor(levels = c(5,4,3,2,1), ordered = TRUE))) +
  geom_bar(mapping = aes(x=price, fill=rating), position = "fill") +
  geom_text(data = . %>% 
              # Grouping to calculate % by price and by rating 
              group_by(price, rating) %>% #<<
              # Count rows
              tally() %>%                 #<<
              # Convert to p
              mutate(p = n / sum(n)) %>%  #<<
              # Re-order to match the order in bar chart
              arrange(desc(rating)),      #<<
            aes(x = price, y = p, label = str_c(round(p,3)*100,"%")), color = "white",
            position = position_stack(vjust=0.5)) +
  ggdark::dark_theme_gray() # Dark theme because texts are not visible against white bg
## Inverted geom defaults of fill and color/colour.
## To change them back, use invert_geom_defaults().

You can flip it 90-degrees.

ggplot(data = yelp %>% 
         st_set_geometry(NULL) %>% 
         mutate(rating = round(rating,0) %>% factor(levels = c(5,4,3,2,1), ordered = TRUE))) +
  geom_bar(mapping = aes(x=price, fill=rating), position = "fill") +
  geom_text(data = . %>% 
              # Grouping to calculate % by price and by rating 
              group_by(price, rating) %>% #<<
              # Count rows
              tally() %>%                 #<<
              # Convert to p
              mutate(p = n / sum(n)) %>%  #<<
              # Re-order to match the order in bar chart
              arrange(desc(rating)),      #<<
            aes(x = price, y = p, label = str_c(round(p,3)*100,"%")), color = "white",
            position = position_stack(vjust=0.5)) +
  coord_flip() +
  ggdark::dark_theme_gray() # Dark theme because texts are not visible against white bg

Customization example (Optional)

I saw this beautiful example by CÉDRIC SCHERER and wanted to show you a Yelp version of the code.

# Code & ideas borrowed heavily from CÉDRIC SCHERER's personal website: 
# https://www.cedricscherer.com/2021/07/05/a-quick-how-to-on-labelling-bar-graphs-in-ggplot2/

max_city_n <- 10

rest_by_city <- yelp %>% 
  st_set_geometry(NULL) %>% 
  group_by(location.city) %>% 
  tally() %>% 
  arrange(desc(n)) %>% 
  slice(1:max_city_n) %>% 
  mutate(location.city = factor(location.city, levels = .$location.city[seq(max_city_n,1)])) %>% 
  # Format text label
  mutate(pct = scales::percent(n / sum(n),accuracy = 0.1),
         pct = case_when(row_number() == 1 ~ str_c(pct, " of all businesses"), TRUE ~ pct)) %>% 
  # Define aesthetic properties - label location
  mutate(nudge = case_when(row_number()==1 ~ 1.05, TRUE ~ -0.2)) %>% 
  # Define aesthetic properties - color
  mutate(color = case_when(row_number()==1 ~ "gray30", TRUE ~ "gray70")) %>% 
  # Color palette
  mutate(pal = c(rep('gray70', max_city_n-4), "coral2", "mediumpurple1", "mediumpurple1", "goldenrod1")) %>% 
  # with() is required to be able to call variables with referencing to data frame
  with(
    # ggplot
    ggplot(data = .) +
      # Bars
      geom_col(mapping = aes(y = location.city, x = n, fill = location.city)) +
      # Text
      geom_text(mapping = aes(y = location.city, x = n, label = pct), 
                # Calling aesthetic properties defined above
                hjust=nudge, color=color, 
                # Font styling
                fontface="bold.italic") + 
      # Stretch x axis
      scale_x_continuous(limits = c(NA, 2200)) +
      # Custom palette
      scale_fill_manual(values = pal, guide="none") +
      # Labels
      labs(x = "Count", y = "Cities", title = "Top 10 cities with most restaurants in Fulton & DeKalb Counties\n") +
      # Dark theme
      ggdark::dark_theme_classic()
  )

rest_by_city

Histogram, boxplot, violin

Histogram displays the distribution of a variable. It first assigns observations (i.e., rows) of the given variable (i.e., a column) into bins (0~99, 100~199, 200~299, etc.) and count the number of observations that fall into each bin. So, the taller a bar is, more observations fell into that bin. For example, in the histogram below, the first bar that touches 0 is much taller than other bars because most of the observations had zero or near-zero reviews.

ggplot(census_yelp) +
  geom_histogram(mapping = aes(x = review_count))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(census_yelp) +
  geom_histogram(mapping = aes(x = review_count), 
                 bins = 60) #<< increase the number of bins from 30 to 50. You get more bars.

ggplot(census_yelp) +
  geom_histogram(mapping = aes(x = review_count), 
                 bins = 60,
                 color="black")  #<< color of the outline

ggplot(data = census_yelp) +
  geom_histogram(mapping = aes(x = review_count, fill=county), #<<
                 bins = 60,
                 color="black") +
  scale_x_continuous(breaks=seq(0,1300, by=100))

ggplot(data = census_yelp) +
  geom_histogram(mapping = aes(x = review_count, fill=county), 
                 bins = 60,
                 color="black",
                 position = "identity", #<<
                 alpha = 0.2) #<<

  scale_x_continuous(breaks=seq(0,1300, by=100))
## <ScaleContinuousPosition>
##  Range:  
##  Limits:    0 --    1
ggplot(data = census_yelp) +
  geom_histogram(mapping = aes(x = review_count, fill=county), 
                 bins = 60,
                 color="black",
                 position = "dodge") + #<< #<<
  scale_x_continuous(breaks=seq(0,1300, by=100))

ggplot(data = census_yelp) +
  geom_histogram(mapping = aes(x = review_count, fill=county), 
                 bins = 60,
                 color="black",
                 position = "dodge") + #<< #<<
  scale_x_continuous(breaks=seq(0,1300, by=100)) + 
  scale_fill_manual(values=c("#999999", "#E69F00"))

Although the histogram above allows us to effectively see the distribution, comparing multiple distributions are often easier with a boxplot. You need to understand what each component of a boxplot means to read it properly. Plotly::ggplotly() provides a good interactive visualization about how to read the plot. Note that upper fence = Q3 + (1.5*IQR), where IQR is interquartile range (Q3-Q1). The lower fence is Q1 - (1.5*IQR).

bxplot <- ggplot(data = yelp) +
  geom_boxplot(aes(x=price, y=review_count),
               color="black",fill="white")

plotly::ggplotly(bxplot)
a <- ggplot(data = yelp) +
  geom_boxplot(aes(x=price, y=review_count), 
               fill = "white", color = "black")

b <- ggplot(data = yelp) +
  geom_boxplot(aes(x=review_count, y=price), 
               fill="white", color="black") 

gridExtra::grid.arrange(a, b)

Of course, you can use scale_fill_manual() to use your custom color. To do this, however, you need to have specified fill inside aes(). If it is not specified at all or is specified outside of aes(), custom color won’t work.

ggplot(data = yelp %>%
         st_join(census %>% st_transform(crs = st_crs(yelp))) %>% 
         separate(col = NAME, into=c("tract","county","state"),sep=", ") %>% 
         drop_na(county)
       ) +
  geom_boxplot(aes(x = price, y=review_count, fill = price), 
               color = "black") +
  facet_wrap(~county) +
  scale_fill_brewer(palette = "Blues")

Violin plot is yet another plot for visualizing the distribution of variables. While a boxplot allows you to see where the upper/lower fences are and where the median and quartiles are, a violin plot allows you to see the concentration of observations at a certain bins (or range if you will) of values.

vplot <- ggplot(data = yelp %>%
         st_join(census %>% st_transform(crs = st_crs(yelp))) %>% 
         separate(col = NAME, into=c("tract","county","state"),sep=", ") %>% 
         drop_na(county)
       ) +
  geom_violin(aes(x = price, y=review_count, fill = price), 
               color = "black") +
  facet_wrap(~county) +
  scale_fill_brewer(palette = "Blues")

plotly::ggplotly(vplot)

So, do restaurants in wealthy neighborhoods get higher Yelp ratings?

As we’ve seen so far, data visualization is an indispensable tool for urban analysts. If we have some time left in the class, think about socioeconomic/demographic variables in American Community Survey that may associate with review count, rating, and/or price of Yelp data. Then, get the data, plot them, and draw insights from them.

census_yelp %>% 
  mutate(review_count_cut = cut(review_count, breaks = quantile(review_count, prob = c(0,0.5,0.75,1)), include.lowest=TRUE)) %>%
  mutate(pct_white = race.white / race.tot) %>% 
  ggplot(data = ., aes(x = hhincome, y = rating)) +
  geom_point(mapping = aes(color = review_count_cut)) +
  scale_color_manual(values = c("gray50", "orange", "red"), labels = c("0 - 50th", "50th- 75th", "75th - 100th")) +
  labs(x = "Annual Household Income", y = "Yelp Rating", color = "Review County (discrete)", title = "Household Income vs. Rating") +
  ggdark::dark_theme_gray() +
  
  # ------------------------------------------------------------------
  # This line of code adds the correlation analysis result to the plot
  ggpubr::stat_cor(method = "pearson", label.x = 160000, label.y = 1.5)
## Warning: Removed 139 rows containing non-finite values (stat_cor).
## Warning: Removed 139 rows containing missing values (geom_point).

census_yelp %>% 
  mutate(review_count_cut = cut(review_count, breaks = quantile(review_count, prob = c(0,0.5,0.75,1)), include.lowest=TRUE)) %>%
  mutate(pct_white = race.white / race.tot) %>% 
  ggplot(data = ., aes(x = pct_white, y = rating)) +
  geom_point(mapping = aes(color = review_count_cut)) +
  scale_color_manual(values = c("gray50", "orange", "red"), labels = c("0 - 50th", "50th- 75th", "75th - 100th")) +
  labs(x = "Proportion of White Residents", y = "Yelp Rating", color = "Review County (discrete)", title = "Proportion of White Residents vs. Rating") +
  ggdark::dark_theme_gray() + 
  
  # ------------------------------------------------------------------
  # This line of code adds the correlation analysis result to the plot
  ggpubr::stat_cor(method = "pearson", label.x = 0.7, label.y = 1.5) 
## Warning: Removed 135 rows containing non-finite values (stat_cor).
## Warning: Removed 135 rows containing missing values (geom_point).