Visualization for data exploration & communication

Bonwoo Koo & Subhrajit Guhathakurta

2022-09-27

Why visualization?

Data visualization is an essential tool for urban analytics. It offers an intuitive and quick way to understand the characteristics and relationships embedded in your data. Sometimes a good data visualization can deliver a message more effectively than any other medium and exercise transformative power that shapes people’s perception (see this example). Not to mention that cool visualizations are cool.

The first thing we analysts do when we get our hands on a dataset is to understand it. The best way to understand any given data is to visualize it. We have a wide variety of maps and charts to choose from, including scatterplots, histograms, boxplots, violin plots, and mapping. We have been doing mapping a lot, so this document will focus more on other charts.

library(tidyverse)
library(sf)
library(tmap)
library(leaflet)
library(here)
library(tidycensus)
# ---- Data preparation ----
# Yelp restaurant points (an sf object), read from the course repository
yelp <- read_rds(
  "https://github.com/BonwooKoo/UrbanAnalytics2022/blob/main/Lab/module_1/week4/yelp_in.rds?raw=true"
)

# Census data: register the API key kept in the `census_api` environment variable
census_api_key(Sys.getenv("census_api"))

# ACS variables to request; the vector names become the output column prefixes
census_var <- c(
  hhincome   = "B19019_001",
  race.tot   = "B02001_001",
  race.white = "B02001_002",
  race.black = "B02001_003"
)

# Tract polygons with the ACS variables for Fulton and DeKalb Counties, GA
census <- get_acs(
  geography = "tract",
  state = "GA",
  county = c("Fulton", "DeKalb"),
  variables = census_var,
  year = 2020,
  output = "wide",
  geometry = TRUE
)

# Columns that are averaged per tract: the ACS estimate columns
# (variable name + "E") plus the Yelp rating and review count
summarise_mean <- c(
  str_c(names(census_var), "E"),
  "rating",
  "review_count"
)

census_yelp <- census %>%
  # Split NAME on ", " into tract / county / state
  separate(col = NAME, into = c("tract", "county", "state"), sep = ", ") %>%
  # Spatial join: attach Yelp points (reprojected to the census CRS) to tracts.
  # `n` is a per-restaurant counter; `price` becomes the number of characters
  # in the price string
  st_join(
    yelp %>%
      mutate(n = 1, price = nchar(price)) %>%
      st_transform(crs = st_crs(census))
  ) %>%
  # One row per tract
  group_by(GEOID, county) %>%
  # Mean for the columns in `summarise_mean`, sum for n, median for price
  summarise(
    across(all_of(summarise_mean), mean),
    n = sum(n),
    price = median(price)
  ) %>%
  # Release grouping
  ungroup() %>%
  # Drop the trailing "E" from the ACS estimate column names
  rename_with(
    function(nm) str_remove(nm, "E$"),
    all_of(str_c(names(census_var), "E"))
  ) %>%
  # Replace NA in n and review_count with 0
  mutate(across(c(n, review_count), function(v) replace_na(v, 0)))

As usual, we can use tmap to visualize the data.

# Switch tmap to its "view" mode
tmap_mode("view")
## tmap mode set to interactive viewing

# Tract polygons colored by review_count (quantile classes)
a <- tm_shape(census_yelp) +
  tm_polygons(col = "review_count", style = "quantile")

# Individual Yelp points colored by review_count (quantile classes)
b <- tm_shape(yelp) +
  tm_dots(col = "review_count", style = "quantile")

# Show both maps together
tmap_arrange(a, b, sync = TRUE)

Or we can use the leaflet package for mapping.

 library(leaflet)
 library(htmlwidgets)
 library(htmltools)

# CSS rules for the map title control (class `map_title`)
tag.map.title <- tags$style(
  HTML("
  .leaflet-control.map_title { 
    position: absolute;
    left: 50px;
    width: 320px;
    text-align: left;
    color: white;
    padding-left: 10px; 
    background: rgba(200,200,200,0.2);
    font-weight: bold;
    font-size: 20px;
    font-family: Helvetica;
    border-color: white;
    border-radius: 10px;
  }")
)

# Title element: a <div> carrying the style rules above and the title text
title <- tags$div(
  class = "map_title",
  tag.map.title,
  HTML("<p>Restaurants in Fulton and DeKalb Counties from Yelp</p>")
)

# Color palette: quantile classes of review_count on the "YlOrRd" ramp
fill_pal <- colorQuantile(palette = "YlOrRd", domain = yelp$review_count)

# Label for mouseover & popup: linked business name, review count and rating.
# The href value is wrapped in double quotes, and both the URL and the name are
# HTML-escaped, so characters such as spaces, quotes, `&`, `<` or `>` in the
# Yelp data cannot break the markup. paste0() avoids the stray spaces that
# paste() would insert around every piece.
yelp_labels <- paste0(
  "<a href=\"", htmltools::htmlEscape(yelp$url, attribute = TRUE), "\">",
  htmltools::htmlEscape(yelp$name), "</a><br>",
  "<strong>Review Count: </strong>", yelp$review_count, "<br>",
  "<strong>Rating: </strong>", yelp$rating
) %>%
  # Mark each label as HTML so leaflet renders the tags instead of showing them
  lapply(htmltools::HTML)

# Creating a Leaflet widget
leaflet() %>%
  # Setting the view on load
  setView(lng = -84.3903996350635, lat = 33.77074368998939, zoom = 11) %>%

  # Dark base map
  addProviderTiles(providers$CartoDB.DarkMatterNoLabels) %>%

  # Polygon boundary: outline of all tracts merged into one shape.
  # The census layer uses the NAD83 datum, but leaflet needs WGS84
  # longitude/latitude, so the outline is reprojected to EPSG:4326 first
  # (otherwise leaflet warns about an inconsistent datum).
  addPolygons(
    data = census %>%
      st_union() %>%
      st_transform(crs = 4326),
    opacity = 0.2,
    fillOpacity = 0,
    weight = 1,
    color = "white"
  ) %>%

  # Yelp points: radius scales with rating, color with review_count
  addCircleMarkers(
    data = yelp,
    radius = ~rating * 1.5,
    opacity = 0.2,
    fillColor = ~fill_pal(review_count),
    weight = 1,
    color = ~fill_pal(review_count),
    popup = ~yelp_labels,
    label = ~yelp_labels
  ) %>%

  # Legend
  addLegend(
    "bottomright",
    pal = fill_pal,
    values = yelp$review_count,
    title = "Review Count",
    opacity = 1
  ) %>%

  # Title
  addControl(title, position = "topleft", className = "map_title")



For static maps with a lot of customizability, ggplot2 can also create maps. See this example for inspiration.

Creating a ggplot

In tmap package, two functions always go hand-in-hand, namely tm_shape() and tm_polygons() (or tm_lines, tm_dots, etc.). The tm_shape() function declares the data object to be displayed. Then, the tm_polygons() function defines the geometry shape and other associated characteristics.

A similar structure is used in the ggplot2 package. Creating a plot needs at least two functions that are connected by +: the ggplot() function and geom_point() (or another geometry type, such as geom_line, geom_boxplot, etc.). In the example below, ggplot(data = census_yelp) indicates that we are drawing a ggplot using the census_yelp data. Then, geom_point(aes(x = review_count, y = rating)) shows that we are going to draw a scatterplot using the review_count and rating columns.

# Scatterplot: review_count on x, rating on y (one point per census tract)
ggplot(data = census_yelp) +
  geom_point(aes(x = review_count, y = rating))
## Warning: Removed 134 rows containing missing values (geom_point).

We can add additional information to this plot using a few different strategies, including colors, sizes, and shapes inside the aes() part of the code. For example, we can add price information using color.

# Same scatterplot, with price mapped to point color
census_yelp %>%
  ggplot() +
  geom_point(
    aes(x = review_count, y = rating, color = price) #<<
  )
## Warning: Removed 134 rows containing missing values (geom_point).

# Same scatterplot, with price mapped to point size
census_yelp %>%
  ggplot() +
  geom_point(
    aes(x = review_count, y = rating, size = price) #<<
  )
## Warning: Removed 134 rows containing missing values (geom_point).

# Transparency (alpha) mapped to price
fig1 <- ggplot(data = census_yelp) +
  geom_point(
    mapping = aes(x = review_count, y = rating, alpha = price) #<<
  )

# Transparency and size both mapped to price
fig2 <- ggplot(data = census_yelp) +
  geom_point(
    mapping = aes(
      x = review_count,
      y = rating,
      alpha = price, #<<
      size = price   #<<
    )
  )

# Stack the two figures in a single column
gridExtra::grid.arrange(fig1, fig2, ncol = 1)
## Warning: Removed 134 rows containing missing values (geom_point).
## Removed 134 rows containing missing values (geom_point).

If you put, for example, the color and size arguments outside of aes(), the visual properties from those arguments are not mapped to your data; they are applied directly to the plot.

# Color set outside aes(): every point is drawn in orange
fig2 <- ggplot(data = census_yelp) +
  geom_point(
    mapping = aes(x = review_count, y = rating),
    color = "orange" #<<
  )

# Size set outside aes(): every point is drawn at size 5
fig3 <- ggplot(data = census_yelp) +
  geom_point(
    mapping = aes(x = review_count, y = rating),
    size = 5 #<<
  )

# Stack the two figures in a single column
gridExtra::grid.arrange(fig2, fig3, ncol = 1)
## Warning: Removed 134 rows containing missing values (geom_point).
## Removed 134 rows containing missing values (geom_point).

Remember that if you use facet_wrap with continuous data, it will generate as many plots as there are unique values in that variable. Avoid writing such code!

# One panel per county
census_yelp %>%
  ggplot() +
  geom_point(aes(x = review_count, y = rating)) +
  facet_wrap(vars(county)) #<<
## Warning: Removed 134 rows containing missing values (geom_point).

# Grid of panels: one row per price level, one column per county
census_yelp %>%
  ggplot() +
  geom_point(aes(x = review_count, y = rating)) +
  facet_grid(rows = vars(price), cols = vars(county)) #<<
## Warning: Removed 134 rows containing missing values (geom_point).

# price will be rows; county will be columns.
# try swapping price and county
# Linear-model trend line of rating against review_count
census_yelp %>%
  ggplot() +
  geom_smooth(aes(x = review_count, y = rating), method = "lm")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 134 rows containing non-finite values (stat_smooth).

# More than one layer: points plus a linear trend line.
# Each layer carries its own (identical) aes() mapping.
census_yelp %>%
  ggplot() +
  geom_point(mapping = aes(x = review_count, y = rating)) +
  geom_smooth(mapping = aes(x = review_count, y = rating), method = "lm")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 134 rows containing non-finite values (stat_smooth).
## Warning: Removed 134 rows containing missing values (geom_point).

In the code above, we are repeating aes(x=review_count, y=rating) twice. If we know that mapping will be the same in multiple layers, we can define it in ggplot().

# The mapping given to ggplot() is shared by both layers below
ggplot(
  data = census_yelp,
  mapping = aes(x = review_count, y = rating) #<<
) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 134 rows containing non-finite values (stat_smooth).
## Warning: Removed 134 rows containing missing values (geom_point).

If you want to add a specific mapping to a layer, you provide additional mapping to individual layers.

# Shared x/y mapping in ggplot(); only the trend lines are colored by county
ggplot(
  data = census_yelp,
  mapping = aes(x = review_count, y = rating)
) +
  geom_point() +
  geom_smooth(
    mapping = aes(color = county), #<<
    method = "lm"
  )
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 134 rows containing non-finite values (stat_smooth).
## Warning: Removed 134 rows containing missing values (geom_point).

You can append labs() to specify labels.

# Same plot with axis, legend and title text supplied through labs()
ggplot(
  data = census_yelp,
  mapping = aes(x = review_count, y = rating)
) +
  geom_point() +
  geom_smooth(mapping = aes(color = county), method = "lm") +
  labs( #<<
    title = "Do better rated restaurants have more reviews?",
    x = "Review Count in Yelp",
    y = "Rating in Yelp",
    color = "County in Census"
  )
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 134 rows containing non-finite values (stat_smooth).
## Warning: Removed 134 rows containing missing values (geom_point).

Aesthetic options

We can change the overall theme of the plot using theme_<...>.

# Points colored by county, drawn with the black-and-white theme
census_yelp %>%
  ggplot() +
  geom_point(mapping = aes(x = review_count, y = rating, color = county)) +
  labs(
    title = "Do better rated restaurants have more reviews?",
    x = "Review Count in Yelp",
    y = "Rating in Yelp",
    color = "County in Census"
  ) +
  theme_bw()
## Warning: Removed 134 rows containing missing values (geom_point).

Of course, dark is always cooler.

# Same plot with the dark gray theme from the ggdark package
census_yelp %>%
  ggplot() +
  geom_point(mapping = aes(x = review_count, y = rating, color = county)) +
  labs(
    title = "Do better rated restaurants have more reviews?",
    x = "Review Count in Yelp",
    y = "Rating in Yelp",
    color = "County in Census"
  ) +
  ggdark::dark_theme_gray()
## Inverted geom defaults of fill and color/colour.
## To change them back, use invert_geom_defaults().
## Warning: Removed 134 rows containing missing values (geom_point).

If you want to use your custom color choices - for a discrete variable.

# Discrete variable (county) with hand-picked colors
census_yelp %>%
  ggplot() +
  geom_point(mapping = aes(x = review_count, y = rating, color = county)) +
  labs(
    title = "Do better rated restaurants have more reviews?",
    x = "Review Count in Yelp",
    y = "Rating in Yelp",
    color = "County in Census"
  ) +
  scale_color_manual(values = c("green", "darkblue")) + #<<
  theme_bw()
## Warning: Removed 134 rows containing missing values (geom_point).

If you want to use your custom color choices - for a continuous variable.

# Continuous variable (hhincome) on a dark-blue-to-red gradient
census_yelp %>%
  ggplot() +
  geom_point(mapping = aes(x = review_count, y = rating, color = hhincome)) +
  labs(
    title = "Do better rated restaurants have more reviews?",
    x = "Review Count in Yelp",
    y = "Rating in Yelp",
    color = "Annual Household Income"
  ) +
  scale_color_gradient(low = "darkblue", high = "red") + #<<
  theme_bw()
## Warning: Removed 134 rows containing missing values (geom_point).

# The four tracts with the largest review_count
outliers <- census_yelp %>%
  arrange(desc(review_count)) %>%
  slice(seq_len(4))

ggplot(
  data = census_yelp,
  mapping = aes(x = review_count, y = rating) # moved aes() to here
) +
  # All tracts, colored by household income
  geom_point(mapping = aes(color = hhincome)) +
  # Black open circles drawn around the outliers
  geom_point(data = outliers, size = 3, shape = 1, color = "black") +
  # County name as a label next to each outlier
  ggrepel::geom_label_repel(data = outliers, mapping = aes(label = county)) +
  labs(
    title = "Do better rated restaurants have more reviews?",
    x = "Review Count in Yelp",
    y = "Rating in Yelp",
    color = "Annual Household Income"
  ) +
  scale_color_gradient(low = "darkblue", high = "red") + #<<
  theme_light()
## Warning: Removed 134 rows containing missing values (geom_point).

Other plots

Bar charts are also very frequently used. Note that ggplot creates the y-axis automatically by counting how many rows there are for each category of x. You can try yelp %>% group_by(price) %>% tally() to check the exact Y values for this plot.

# Bar chart: one bar per price category, bar height = number of rows
yelp %>%
  ggplot() +
  geom_bar(aes(x = price))

# The same counts as a table
yelp %>%
  group_by(price) %>%
  tally()
## Simple feature collection with 4 features and 2 fields
## Geometry type: MULTIPOINT
## Dimension:     XY
## Bounding box:  xmin: -84.74486 ymin: 33.51273 xmax: -84.07477 ymax: 34.16806
## Geodetic CRS:  WGS 84
## # A tibble: 4 × 3
##   price     n                                                           geometry
##   <chr> <int>                                                   <MULTIPOINT [°]>
## 1 $      1653 ((-84.66889 33.51819), (-84.66473 33.52469), (-84.65964 33.52801)…
## 2 $$     1850 ((-84.73536 33.51273), (-84.73738 33.52528), (-84.73587 33.52488)…
## 3 $$$     141 ((-84.74486 33.52821), (-84.54959 33.66628), (-84.44467 33.6685),…
## 4 $$$$     23 ((-84.36382 33.75925), (-84.38283 33.78418), (-84.38258 33.77639)…

We can also further break each price level by another categorical variable. We use rating to see the relative frequency of each rating for each price level. This is done by adding fill=rating in the mapping.

# Stacked bars: price on x, bars split by rating rounded to a whole number
yelp %>%
  st_drop_geometry() %>%
  mutate(
    rating = factor(round(rating), ordered = TRUE) #<<
    # drop the factor(..., ordered = TRUE) wrapper and see what happens
  ) %>%
  ggplot() +
  geom_bar(mapping = aes(x = price, fill = rating), position = "stack")

By changing position="stack" to position="fill", we convert the Y-axis to the proportion within each level of price and fill it up to the top. This shows more clearly how different rating levels are distributed within each price level.

# Filled bars: each price level is scaled to 1 and split by rounded rating
yelp %>%
  st_drop_geometry() %>%
  mutate(
    rating = factor(round(rating), ordered = TRUE) #<<
  ) %>%
  ggplot() +
  geom_bar(mapping = aes(x = price, fill = rating), position = "fill") #<<

We want rating=5 to be on the top because (I think) it is more intuitive to see higher value on top. We can flip the bar chart vertically by adjusting the levels when we declare rating variable into a factor.

# Filled bars with the rating levels declared from 5 down to 1
yelp %>%
  st_drop_geometry() %>%
  mutate(
    rating = factor(
      round(rating),
      levels = 5:1, #<<
      ordered = TRUE
    )
  ) %>%
  ggplot() +
  geom_bar(mapping = aes(x = price, fill = rating), position = "fill")

Sometimes we want to see the exact figures on top of the bar chart. So, let’s add the percentage within each level of price as labels.

# Filled bars with the percentage of each rating written inside its segment
yelp %>%
  st_drop_geometry() %>%
  mutate(rating = factor(round(rating), levels = 5:1, ordered = TRUE)) %>%
  ggplot() +
  geom_bar(mapping = aes(x = price, fill = rating), position = "fill") +
  geom_text(
    # The label data is derived from the plot data by this function
    data = function(d) {
      d %>%
        # Grouping to calculate % by price and by rating
        group_by(price, rating) %>% #<<
        # Count rows
        tally() %>% #<<
        # Convert to p (share within each price level)
        mutate(p = n / sum(n)) %>% #<<
        # Re-order to match the order in bar chart
        arrange(desc(rating)) #<<
    },
    mapping = aes(x = price, y = p, label = str_c(round(p, 3) * 100, "%")),
    color = "white",
    position = position_stack(vjust = 0.5)
  ) +
  # Dark theme because texts are not visible against white bg
  ggdark::dark_theme_gray()

You can flip it 90-degrees.

# Same labelled, filled bar chart with the axes flipped (horizontal bars)
yelp %>%
  st_drop_geometry() %>%
  mutate(rating = factor(round(rating), levels = 5:1, ordered = TRUE)) %>%
  ggplot() +
  geom_bar(mapping = aes(x = price, fill = rating), position = "fill") +
  geom_text(
    # The label data is derived from the plot data by this function
    data = function(d) {
      d %>%
        # Grouping to calculate % by price and by rating
        group_by(price, rating) %>% #<<
        # Count rows
        tally() %>% #<<
        # Convert to p (share within each price level)
        mutate(p = n / sum(n)) %>% #<<
        # Re-order to match the order in bar chart
        arrange(desc(rating)) #<<
    },
    mapping = aes(x = price, y = p, label = str_c(round(p, 3) * 100, "%")),
    color = "white",
    position = position_stack(vjust = 0.5)
  ) +
  coord_flip() +
  # Dark theme because texts are not visible against white bg
  ggdark::dark_theme_gray()

Customization example (Optional)

I saw this beautiful example by CÉDRIC SCHERER and wanted to show you a Yelp version of the code.

# Code & ideas borrowed heavily from CÉDRIC SCHERER's personal website: 
# https://www.cedricscherer.com/2021/07/05/a-quick-how-to-on-labelling-bar-graphs-in-ggplot2/

# Number of cities (ranked by restaurant count) to display
max_city_n <- 10

# Horizontal bar chart of the cities with the most restaurants.
# Note: `rest_by_city` ends up holding the ggplot object, not a data frame.
rest_by_city <- yelp %>% 
  st_set_geometry(NULL) %>% 
  # Restaurants per city
  group_by(location.city) %>% 
  tally() %>% 
  # Share of ALL restaurants. This must be computed before slicing to the top
  # cities; after slicing, sum(n) would only cover the displayed cities and the
  # "of all businesses" label would be wrong.
  mutate(share = n / sum(n)) %>% 
  # Keep the top `max_city_n` cities
  arrange(desc(n)) %>% 
  slice(seq_len(max_city_n)) %>% 
  # Reverse the factor levels so the largest city is drawn at the top
  mutate(location.city = factor(location.city, levels = rev(location.city))) %>% 
  # Format text label
  mutate(pct = scales::percent(share, accuracy = 0.1),
         pct = case_when(row_number() == 1 ~ str_c(pct, " of all businesses"), TRUE ~ pct)) %>% 
  # Define aesthetic properties - label location
  mutate(nudge = case_when(row_number() == 1 ~ 1.05, TRUE ~ -0.2)) %>% 
  # Define aesthetic properties - color
  mutate(color = case_when(row_number() == 1 ~ "gray30", TRUE ~ "gray70")) %>% 
  # Color palette (one entry per factor level; the last four levels, i.e. the
  # four largest cities, get accent colors)
  mutate(pal = c(rep("gray70", max_city_n - 4), "coral2", "mediumpurple1", "mediumpurple1", "goldenrod1")) %>% 
  # with() is required so that nudge, color and pal can be used as plain
  # vectors outside of aes()
  with(
    # ggplot
    ggplot(data = .) +
      # Bars
      geom_col(mapping = aes(y = location.city, x = n, fill = location.city)) +
      # Text
      geom_text(mapping = aes(y = location.city, x = n, label = pct), 
                # Calling aesthetic properties defined above
                hjust = nudge, color = color, 
                # Font styling
                fontface = "bold.italic") + 
      # Stretch x axis
      scale_x_continuous(limits = c(NA, 2200)) +
      # Custom palette
      scale_fill_manual(values = pal, guide = "none") +
      # Labels
      labs(x = "Count", y = "Cities", title = "Top 10 cities with most restaurants in Fulton & DeKalb Counties\n") +
      # Dark theme
      ggdark::dark_theme_classic()
  )

rest_by_city

Histogram, boxplot, violin

Histogram:

# Distribution of review_count across individual restaurants
ggplot(data = yelp, mapping = aes(x = review_count)) +
  geom_histogram(color = "gray50")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Boxplot:

# review_count by price level, one hand-picked fill color per level
ggplot(
  data = yelp,
  mapping = aes(x = price, y = review_count, fill = price)
) +
  geom_boxplot(color = "black") +
  scale_fill_manual(
    values = c("#999999", "#E69F00", "#56B4E9", "#AF3FF4")
  ) +
  theme_classic()

Violin plot:

# Distribution of tract-level review_count for each county
ggplot(
  data = census_yelp,
  mapping = aes(x = county, y = review_count, fill = county)
) +
  geom_violin(color = "black")

Interactivity

Something I recently found out..

# Build the static scatterplot first ...
scat_plot <- census_yelp %>%
  ggplot() +
  geom_point(mapping = aes(x = review_count, y = rating, color = county)) +
  labs(
    title = "Do better rated restaurants have more reviews?",
    x = "Review Count in Yelp",
    y = "Rating in Yelp",
    color = "County in Census"
  ) +
  scale_color_manual(values = c("green", "darkblue")) + #<<
  theme_bw()

# ... then hand it to plotly
plotly::ggplotly(scat_plot)

So, do restaurants in wealthy neighborhoods get higher Yelp ratings?

# Household income vs. Yelp rating, with points colored by review-count class
census_yelp %>% 
  # Bin review_count at its 0 / 50 / 75 / 100th percentiles.
  # `probs` is spelled out in full rather than relying on partial matching.
  mutate(
    review_count_cut = cut(
      review_count,
      breaks = quantile(review_count, probs = c(0, 0.5, 0.75, 1)),
      include.lowest = TRUE
    )
  ) %>%
  ggplot(data = ., aes(x = hhincome, y = rating)) +
  geom_point(mapping = aes(color = review_count_cut)) +
  scale_color_manual(values = c("gray50", "orange", "red"), labels = c("0 - 50th", "50th- 75th", "75th - 100th")) +
  labs(x = "Annual Household Income", y = "Yelp Rating", color = "Review Count (discrete)", title = "Household Income vs. Rating") +
  # Pearson correlation printed at the given plot coordinates
  ggpubr::stat_cor(method = "pearson", label.x = 160000, label.y = 1.5) + 
  ggdark::dark_theme_gray()
## Warning: Removed 139 rows containing non-finite values (stat_cor).
## Warning: Removed 139 rows containing missing values (geom_point).

What other

# Proportion of white residents vs. Yelp rating, colored by review-count class
census_yelp %>% 
  # Bin review_count at its 0 / 50 / 75 / 100th percentiles.
  # `probs` is spelled out in full rather than relying on partial matching.
  mutate(
    review_count_cut = cut(
      review_count,
      breaks = quantile(review_count, probs = c(0, 0.5, 0.75, 1)),
      include.lowest = TRUE
    )
  ) %>%
  # Share of white residents in each tract
  mutate(pct_white = race.white / race.tot) %>% 
  ggplot(data = ., aes(x = pct_white, y = rating)) +
  geom_point(mapping = aes(color = review_count_cut)) +
  scale_color_manual(values = c("gray50", "orange", "red"), labels = c("0 - 50th", "50th- 75th", "75th - 100th")) +
  labs(x = "Proportion of White Residents", y = "Yelp Rating", color = "Review Count (discrete)", title = "Proportion of White Residents vs. Rating") +
  # Pearson correlation printed at the given plot coordinates
  ggpubr::stat_cor(method = "pearson", label.x = 0.7, label.y = 1.5) + 
  ggdark::dark_theme_gray()
## Warning: Removed 135 rows containing non-finite values (stat_cor).
## Warning: Removed 135 rows containing missing values (geom_point).