Why visualization?
Data visualization is an essential tool for urban analytics. It offers an intuitive and quick way to understand the characteristics and relationships embedded in your data. Sometimes a good data visualization can deliver a message more effectively than any other medium and exercise transformative power that shapes people’s perception (see this example). Not to mention that cool visualizations are cool.
The first thing we analysts do when we get our hands on a dataset is to understand it. The best way to understand any given data is to visualize it. We have a wide variety of maps and charts to choose from, including scatterplots, histograms, boxplots, violin plots, and mapping. We have been doing mapping a lot, so this document will focus more on other charts.
library(tidyverse)
library(sf)
library(tmap)
library(leaflet)
library(here)
library(tidycensus)
# Data preparation ----
# Yelp businesses: pre-processed file hosted on GitHub
yelp <- read_rds(
  "https://github.com/BonwooKoo/UrbanAnalytics2022/blob/main/Lab/module_1/week4/yelp_in.rds?raw=true"
)

# Census data: register the API key stored in the `census_api` environment variable
census_api_key(Sys.getenv("census_api"))

# ACS variables to download; the names become column prefixes in the wide output
census_var <- c(
  hhincome   = "B19019_001",
  race.tot   = "B02001_001",
  race.white = "B02001_002",
  race.black = "B02001_003"
)

# Tract-level 2020 ACS data (with geometry) for Fulton and DeKalb Counties, GA
census <- get_acs(
  geography = "tract",
  state     = "GA",
  county    = c("Fulton", "DeKalb"),
  variables = census_var,
  year      = 2020,
  output    = "wide",
  geometry  = TRUE
)
# Columns averaged within each tract: ACS estimate columns (suffix "E")
# plus the Yelp rating and review count
summarise_mean <- c(str_c(names(census_var), "E"), "rating", "review_count")

census_yelp <- census %>%
  # Split NAME into its three comma-separated parts
  separate(col = NAME, into = c("tract", "county", "state"), sep = ", ") %>%
  # Spatial join: Yelp points (re-projected to the census CRS) onto tracts.
  # n = 1 lets us count businesses; price becomes the length of the price string.
  st_join(
    yelp %>%
      mutate(n = 1, price = nchar(price)) %>%
      st_transform(crs = st_crs(census))
  ) %>%
  # One row per tract: mean of the selected columns, sum of n, median price
  group_by(GEOID, county) %>%
  summarise(
    across(all_of(summarise_mean), mean),
    n = sum(n),
    price = median(price)
  ) %>%
  ungroup() %>%
  # Strip the trailing "E" from the ACS estimate columns
  rename_with(~ str_remove(.x, "E$"), str_c(names(census_var), "E")) %>%
  # Tracts without any Yelp business: NA in n and review_count becomes 0
  mutate(across(c(n, review_count), ~ replace_na(.x, 0)))
As usual, using tmap to visualize the data.
# Switch tmap to interactive viewing
tmap_mode("view")
## tmap mode set to interactive viewing
# Tract polygons shaded by review count, in quantile classes
a <- tm_shape(census_yelp) +
  tm_polygons(col = "review_count", style = "quantile")
# Individual Yelp businesses coloured by review count, in quantile classes
b <- tm_shape(yelp) +
  tm_dots(col = "review_count", style = "quantile")
# Show both maps together with synchronised navigation
tmap_arrange(a, b, sync = TRUE)
Or we can use the leaflet package for mapping.
library(leaflet)
library(htmlwidgets)
library(htmltools)
# Style sheet for the map title control (class "map_title")
tag.map.title <- tags$style(
  HTML("
.leaflet-control.map_title {
position: absolute;
left: 50px;
width: 320px;
text-align: left;
color: white;
padding-left: 10px;
background: rgba(200,200,200,0.2);
font-weight: bold;
font-size: 20px;
font-family: Helvetica;
border-color: white;
border-radius: 10px;
}")
)
# Title element: a <div> that carries the style sheet and the title text
title <- tags$div(
  class = "map_title",
  tag.map.title,
  HTML("<p>Restaurants in Fulton and DeKalb Counties from Yelp</p>")
)
# Color palette: quantile classes of review count on the "YlOrRd" ramp
fill_pal <- colorQuantile(palette = "YlOrRd", domain = yelp$review_count)
# Label for mouseover & popup: linked business name, review count and rating.
# The href value is quoted and attribute-escaped and the business name is
# HTML-escaped, so characters such as `&`, `<`, `=` or quotes in the Yelp data
# cannot break the generated markup.
yelp_labels <- paste0(
  "<a href=\"", htmltools::htmlEscape(yelp$url, attribute = TRUE), "\">",
  htmltools::htmlEscape(yelp$name), "</a><br>",
  "<strong>Review Count: </strong>", yelp$review_count, "<br>",
  "<strong>Rating: </strong>", yelp$rating
) %>%
  # Mark each label as HTML so leaflet renders it instead of showing raw tags
  lapply(htmltools::HTML)
# Creating a Leaflet widget
# Creating a Leaflet widget
leaflet() %>%
  # Setting the view on load
  setView(lng = -84.3903996350635, lat = 33.77074368998939, zoom = 11) %>%
  # Dark base map
  addProviderTiles(providers$CartoDB.DarkMatterNoLabels) %>%
  # Polygon boundary. The census geometries are in NAD83, while leaflet expects
  # WGS84 longitude/latitude, so re-project first; otherwise leaflet warns
  # "sf layer has inconsistent datum".
  addPolygons(data = census %>% st_union() %>% st_transform(crs = 4326),
              opacity = 0.2,
              fillOpacity = 0,
              weight = 1,
              color = "white") %>%
  # Yelp points: radius scales with rating, colour with review-count quantile
  addCircleMarkers(data = yelp,
                   radius = ~rating * 1.5,
                   opacity = 0.2,
                   fillColor = ~fill_pal(review_count),
                   weight = 1,
                   color = ~fill_pal(review_count),
                   popup = ~yelp_labels,
                   label = ~yelp_labels) %>%
  # Legend
  addLegend("bottomright", pal = fill_pal, values = yelp$review_count,
            title = "Review Count", opacity = 1) %>%
  # Title
  addControl(title, position = "topleft", className = "map_title")
For static maps with a lot of customizability, ggplot2 can
also create maps. See this
example for inspiration.
Creating a ggplot
In tmap package, two functions always go hand-in-hand, namely tm_shape() and tm_polygons() (or tm_lines, tm_dots, etc.). The tm_shape() function declares the data object to be displayed. Then, the tm_polygons() function defines the geometry shape and other associated characteristics.
A similar structure is used in the ggplot2 package. Creating a plot needs at least two functions that are connected by +: the ggplot() function and geom_point() (or other geometry types, such as geom_line, geom_boxplot, etc.). In the example below, ggplot(data = census_yelp) indicates that we are drawing a ggplot using the census_yelp data. Then, geom_point(aes(x = review_count, y = rating)) shows that we are going to draw a scatterplot using the review_count and rating columns.
# Scatterplot: review_count on the x-axis, rating on the y-axis
ggplot(data = census_yelp) +
  geom_point(aes(x = review_count, y = rating))
## Warning: Removed 134 rows containing missing values (geom_point).
We can add additional information to this plot using a few different
strategies, including colors, sizes, and shapes inside the
aes()
part of the code. For example, we can add
price
information using color.
# Same scatterplot, with price mapped to point colour inside aes()
ggplot(data = census_yelp) +
  geom_point(aes(x = review_count, y = rating, color = price))
## Warning: Removed 134 rows containing missing values (geom_point).
# Same scatterplot, with price mapped to point size inside aes()
ggplot(data = census_yelp) +
  geom_point(aes(x = review_count, y = rating, size = price))
## Warning: Removed 134 rows containing missing values (geom_point).
# price mapped to transparency only
fig1 <- ggplot(data = census_yelp) +
  geom_point(aes(x = review_count, y = rating, alpha = price))
# price mapped to both transparency and size
fig2 <- ggplot(data = census_yelp) +
  geom_point(aes(x = review_count, y = rating, alpha = price, size = price))
# Stack the two figures in a single column
gridExtra::grid.arrange(fig1, fig2, ncol = 1)
## Warning: Removed 134 rows containing missing values (geom_point).
## Removed 134 rows containing missing values (geom_point).
If you put, for example, the color and size arguments outside of aes(), the visual properties from those arguments are not mapped to your data; they are applied directly to the plot.
# color given outside aes(): every point is drawn in orange
fig2 <- ggplot(data = census_yelp) +
  geom_point(aes(x = review_count, y = rating), color = "orange")
# size given outside aes(): every point is drawn at size 5
fig3 <- ggplot(data = census_yelp) +
  geom_point(aes(x = review_count, y = rating), size = 5)
# Stack the two figures in a single column
gridExtra::grid.arrange(fig2, fig3, ncol = 1)
## Warning: Removed 134 rows containing missing values (geom_point).
## Removed 134 rows containing missing values (geom_point).
Remember that if you use facet_wrap with a continuous variable, it will generate as many plots as there are unique values in that variable. Avoid writing such code!
# One panel per county
ggplot(data = census_yelp) +
  geom_point(aes(x = review_count, y = rating)) +
  facet_wrap(vars(county))
## Warning: Removed 134 rows containing missing values (geom_point).
# Grid of panels: one row per price value, one column per county
ggplot(data = census_yelp) +
  geom_point(aes(x = review_count, y = rating)) +
  facet_grid(rows = vars(price), cols = vars(county))
## Warning: Removed 134 rows containing missing values (geom_point).
# price will be rows; county will be columns.
# try swapping price and county
# Linear-model fit of rating on review_count (no points drawn)
ggplot(data = census_yelp) +
  geom_smooth(aes(x = review_count, y = rating), method = "lm")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 134 rows containing non-finite values (stat_smooth).
# More than one layer: points plus a linear fit, each with its own aes()
ggplot(data = census_yelp) +
  geom_point(aes(x = review_count, y = rating)) +
  geom_smooth(aes(x = review_count, y = rating), method = "lm")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 134 rows containing non-finite values (stat_smooth).
## Warning: Removed 134 rows containing missing values (geom_point).
In the code above, we are repeating
aes(x=review_count, y=rating)
twice. If we know that
mapping will be the same in multiple layers, we can define it in
ggplot()
.
# Mapping declared once in ggplot() and inherited by both layers
ggplot(data = census_yelp, aes(x = review_count, y = rating)) +
  geom_point() +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 134 rows containing non-finite values (stat_smooth).
## Warning: Removed 134 rows containing missing values (geom_point).
If you want to add a specific mapping to a layer, you provide additional mapping to individual layers.
# Shared x/y mapping; only the smoothing layer additionally maps county to colour
ggplot(data = census_yelp, aes(x = review_count, y = rating)) +
  geom_point() +
  geom_smooth(aes(color = county), method = "lm")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 134 rows containing non-finite values (stat_smooth).
## Warning: Removed 134 rows containing missing values (geom_point).
You can append
labs()
to specify labels.
# Same plot as before, with a title and human-readable axis/legend labels
ggplot(data = census_yelp, aes(x = review_count, y = rating)) +
  geom_point() +
  geom_smooth(aes(color = county), method = "lm") +
  labs(
    title = "Do better rated restaurants have more reviews?",
    x = "Review Count in Yelp",
    y = "Rating in Yelp",
    color = "County in Census"
  )
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 134 rows containing non-finite values (stat_smooth).
## Warning: Removed 134 rows containing missing values (geom_point).
Aesthetic options
We can change the overall theme of the plot using
theme_<...>
.
# Points coloured by county, drawn with the black-and-white theme
ggplot(data = census_yelp) +
  geom_point(aes(x = review_count, y = rating, color = county)) +
  labs(
    title = "Do better rated restaurants have more reviews?",
    x = "Review Count in Yelp",
    y = "Rating in Yelp",
    color = "County in Census"
  ) +
  theme_bw()
## Warning: Removed 134 rows containing missing values (geom_point).
Of course, dark is always cooler.
# Points coloured by county, drawn with ggdark's dark gray theme.
# NOTE: as the message printed below says, this inverts the geom defaults of
# fill and colour; ggdark::invert_geom_defaults() changes them back.
ggplot(data = census_yelp) +
  geom_point(aes(x = review_count, y = rating, color = county)) +
  labs(
    title = "Do better rated restaurants have more reviews?",
    x = "Review Count in Yelp",
    y = "Rating in Yelp",
    color = "County in Census"
  ) +
  ggdark::dark_theme_gray()
## Inverted geom defaults of fill and color/colour.
## To change them back, use invert_geom_defaults().
## Warning: Removed 134 rows containing missing values (geom_point).
If you want to use your custom color choices - for a discrete variable.
# Hand-picked colours for the discrete county variable
ggplot(data = census_yelp) +
  geom_point(aes(x = review_count, y = rating, color = county)) +
  scale_color_manual(values = c("green", "darkblue")) +
  labs(
    title = "Do better rated restaurants have more reviews?",
    x = "Review Count in Yelp",
    y = "Rating in Yelp",
    color = "County in Census"
  ) +
  theme_bw()
## Warning: Removed 134 rows containing missing values (geom_point).
If you want to use your custom color choices - for a continuous
variable.
# Two-colour gradient for the continuous household-income variable
ggplot(data = census_yelp) +
  geom_point(aes(x = review_count, y = rating, color = hhincome)) +
  scale_color_gradient(low = "darkblue", high = "red") +
  labs(
    title = "Do better rated restaurants have more reviews?",
    x = "Review Count in Yelp",
    y = "Rating in Yelp",
    color = "Annual Household Income"
  ) +
  theme_bw()
## Warning: Removed 134 rows containing missing values (geom_point).
# The four tracts with the highest review_count
outliers <- census_yelp %>%
  arrange(desc(review_count)) %>%
  slice(1:4)

# x/y mapping lives in ggplot() so every layer, including the outlier layers, inherits it
ggplot(data = census_yelp, aes(x = review_count, y = rating)) +
  # All tracts, coloured by household income
  geom_point(aes(color = hhincome)) +
  # Hollow black circles around the outliers
  geom_point(data = outliers, shape = 1, size = 3, color = "black") +
  # Non-overlapping county labels for the outliers
  ggrepel::geom_label_repel(data = outliers, aes(label = county)) +
  scale_color_gradient(low = "darkblue", high = "red") +
  labs(
    title = "Do better rated restaurants have more reviews?",
    x = "Review Count in Yelp",
    y = "Rating in Yelp",
    color = "Annual Household Income"
  ) +
  theme_light()
## Warning: Removed 134 rows containing missing values (geom_point).
Other plots
Bar charts are also used very frequently. Note that ggplot creates the y-axis automatically by counting how many rows there are for each category of x. You can try yelp %>% group_by(price) %>% tally() to check the exact Y values for this plot.
# Bar chart: bar height is the number of rows at each price level
ggplot(data = yelp) +
  geom_bar(aes(x = price))
# Row counts per price level, i.e. the bar heights of the chart above
yelp %>%
  group_by(price) %>%
  tally()
## Simple feature collection with 4 features and 2 fields
## Geometry type: MULTIPOINT
## Dimension: XY
## Bounding box: xmin: -84.74486 ymin: 33.51273 xmax: -84.07477 ymax: 34.16806
## Geodetic CRS: WGS 84
## # A tibble: 4 × 3
## price n geometry
## <chr> <int> <MULTIPOINT [°]>
## 1 $ 1653 ((-84.66889 33.51819), (-84.66473 33.52469), (-84.65964 33.52801)…
## 2 $$ 1850 ((-84.73536 33.51273), (-84.73738 33.52528), (-84.73587 33.52488)…
## 3 $$$ 141 ((-84.74486 33.52821), (-84.54959 33.66628), (-84.44467 33.6685),…
## 4 $$$$ 23 ((-84.36382 33.75925), (-84.38283 33.78418), (-84.38258 33.77639)…
We can also further break each price level by another categorical
variable. We use rating to see the relative frequency of each rating for
each price level. This is done by adding fill=rating
in the
mapping.
# Stacked bar chart of price level, broken down by rating rounded to whole stars.
# round() rounds halves to the even digit (2.5 -> 2 but 3.5 -> 4), which would
# bin any half-step ratings unevenly; floor(x + 0.5) always rounds halves up.
ggplot(data = yelp %>%
         st_set_geometry(NULL) %>%
         mutate(rating = floor(rating + 0.5) %>%
                  factor(ordered = TRUE))) +
  # delete %>% factor(ordered = TRUE) and see what happens
  geom_bar(mapping = aes(x = price, fill = rating), position = "stack")
By changing position="stack" to position="fill", we convert the Y-axis to the proportion within each level of price and fill it up to the top. This shows more clearly how different rating levels are distributed within each price level.
# Proportional (100%) bar chart of rating within each price level.
# round() rounds halves to the even digit (2.5 -> 2 but 3.5 -> 4), which would
# bin any half-step ratings unevenly; floor(x + 0.5) always rounds halves up.
ggplot(data = yelp %>%
         st_set_geometry(NULL) %>%
         mutate(rating = floor(rating + 0.5) %>%
                  factor(ordered = TRUE))) +
  geom_bar(mapping = aes(x = price, fill = rating), position = "fill")
We want rating=5 to be on the top because (I think) it is more
intuitive to see higher value on top. We can flip the bar chart
vertically by adjusting the levels
when we declare
rating
variable into a factor.
# Proportional bar chart with the factor levels listed from 5 down to 1, which
# controls the vertical order of the rating segments.
# round() rounds halves to the even digit (2.5 -> 2 but 3.5 -> 4), which would
# bin any half-step ratings unevenly; floor(x + 0.5) always rounds halves up.
ggplot(data = yelp %>%
         st_set_geometry(NULL) %>%
         mutate(rating = floor(rating + 0.5) %>%
                  factor(levels = c(5, 4, 3, 2, 1),
                         ordered = TRUE))) +
  geom_bar(mapping = aes(x = price, fill = rating), position = "fill")
Sometimes we want to see the exact figures on top of the bar chart. So, let’s add the percentage within each level of price as labels.
# Proportional bar chart with the percentage of each rating printed on its segment.
# round() rounds halves to the even digit (2.5 -> 2 but 3.5 -> 4), which would
# bin any half-step ratings unevenly; floor(x + 0.5) always rounds halves up.
ggplot(data = yelp %>%
         st_set_geometry(NULL) %>%
         mutate(rating = floor(rating + 0.5) %>%
                  factor(levels = c(5, 4, 3, 2, 1), ordered = TRUE))) +
  geom_bar(mapping = aes(x = price, fill = rating), position = "fill") +
  # The label layer derives its own data from the plot data (`.`)
  geom_text(data = . %>%
              # Grouping to calculate % by price and by rating
              group_by(price, rating) %>%
              # Count rows
              tally() %>%
              # Convert to p: tally() leaves the data grouped by price, so
              # sum(n) is the total within each price level
              mutate(p = n / sum(n)) %>%
              # Re-order to match the order in bar chart
              arrange(desc(rating)),
            aes(x = price, y = p, label = str_c(round(p, 3) * 100, "%")), color = "white",
            # Centre each label vertically inside its segment
            position = position_stack(vjust = 0.5)) +
  ggdark::dark_theme_gray() # Dark theme because texts are not visible against white bg
You can flip it 90-degrees.
# Same labelled proportional bar chart, drawn horizontally via coord_flip().
# round() rounds halves to the even digit (2.5 -> 2 but 3.5 -> 4), which would
# bin any half-step ratings unevenly; floor(x + 0.5) always rounds halves up.
ggplot(data = yelp %>%
         st_set_geometry(NULL) %>%
         mutate(rating = floor(rating + 0.5) %>%
                  factor(levels = c(5, 4, 3, 2, 1), ordered = TRUE))) +
  geom_bar(mapping = aes(x = price, fill = rating), position = "fill") +
  # The label layer derives its own data from the plot data (`.`)
  geom_text(data = . %>%
              # Grouping to calculate % by price and by rating
              group_by(price, rating) %>%
              # Count rows
              tally() %>%
              # Convert to p: tally() leaves the data grouped by price, so
              # sum(n) is the total within each price level
              mutate(p = n / sum(n)) %>%
              # Re-order to match the order in bar chart
              arrange(desc(rating)),
            aes(x = price, y = p, label = str_c(round(p, 3) * 100, "%")), color = "white",
            # Centre each label inside its segment
            position = position_stack(vjust = 0.5)) +
  # Swap the x and y axes
  coord_flip() +
  ggdark::dark_theme_gray() # Dark theme because texts are not visible against white bg
Customization example (Optional)
I saw this beautiful example by CÉDRIC SCHERER and wanted to show you a Yelp version of the code.
# Code & ideas borrowed heavily from CÉDRIC SCHERER's personal website:
# https://www.cedricscherer.com/2021/07/05/a-quick-how-to-on-labelling-bar-graphs-in-ggplot2/
# Number of cities to show
max_city_n <- 10
rest_by_city <- yelp %>%
  st_set_geometry(NULL) %>%
  # Count businesses per city, largest first
  group_by(location.city) %>%
  tally() %>%
  arrange(desc(n)) %>%
  # Share of ALL businesses. This must be computed before the table is cut down
  # to the top cities; otherwise the denominator only covers those cities and
  # the "of all businesses" label would be wrong.
  mutate(pct = scales::percent(n / sum(n), accuracy = 0.1)) %>%
  slice(1:max_city_n) %>%
  # Reverse the level order so the city with the most businesses is drawn at the top
  mutate(location.city = factor(location.city, levels = rev(location.city))) %>%
  # Format text label: only the first row spells out what the percentage means
  mutate(pct = case_when(row_number() == 1 ~ str_c(pct, " of all businesses"), TRUE ~ pct)) %>%
  # Define aesthetic properties - label location (hjust > 1 puts the label inside the bar)
  mutate(nudge = case_when(row_number() == 1 ~ 1.05, TRUE ~ -0.2)) %>%
  # Define aesthetic properties - color
  mutate(color = case_when(row_number() == 1 ~ "gray30", TRUE ~ "gray70")) %>%
  # Color palette, consumed in factor-level order by scale_fill_manual()
  mutate(pal = c(rep("gray70", max_city_n - 4), "coral2", "mediumpurple1", "mediumpurple1", "goldenrod1")) %>%
  # with() is required to be able to call variables with referencing to data frame
  with(
    # ggplot
    ggplot(data = .) +
      # Bars
      geom_col(mapping = aes(y = location.city, x = n, fill = location.city)) +
      # Text
      geom_text(mapping = aes(y = location.city, x = n, label = pct),
                # Calling aesthetic properties defined above
                hjust = nudge, color = color,
                # Font styling
                fontface = "bold.italic") +
      # Stretch x axis
      scale_x_continuous(limits = c(NA, 2200)) +
      # Custom palette
      scale_fill_manual(values = pal, guide = "none") +
      # Labels
      labs(x = "Count", y = "Cities", title = "Top 10 cities with most restaurants in Fulton & DeKalb Counties\n") +
      # Dark theme
      ggdark::dark_theme_classic()
  )
# Print the plot
rest_by_city
Histogram, boxplot, violin
Histogram:
# Distribution of review_count; bars are outlined in gray
ggplot(data = yelp) +
  geom_histogram(aes(x = review_count), color = "gray50")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Boxplot:
# review_count by price level, one hand-picked fill colour per level
ggplot(data = yelp) +
  geom_boxplot(aes(x = price, y = review_count, fill = price), color = "black") +
  scale_fill_manual(
    values = c("#999999", "#E69F00", "#56B4E9", "#AF3FF4")
  ) +
  theme_classic()
Violin plot:
# Distribution of review_count within each county
ggplot(data = census_yelp) +
  geom_violin(aes(x = county, y = review_count, fill = county), color = "black")
Interactivity
Something I recently found out..
# Build the static scatterplot first ...
scat_plot <- ggplot(data = census_yelp) +
  geom_point(aes(x = review_count, y = rating, color = county)) +
  scale_color_manual(values = c("green", "darkblue")) +
  labs(
    title = "Do better rated restaurants have more reviews?",
    x = "Review Count in Yelp",
    y = "Rating in Yelp",
    color = "County in Census"
  ) +
  theme_bw()
# ... then hand it to plotly to get an interactive version
plotly::ggplotly(scat_plot)
So, do restaurants in wealthy neighborhoods get higher Yelp ratings?
# Household income vs. Yelp rating, coloured by review-count class.
census_yelp %>%
  # Three review-count classes: bottom half, 50th-75th and top quarter
  # (`probs` spelled out in full instead of relying on partial argument matching)
  mutate(review_count_cut = cut(review_count,
                                breaks = quantile(review_count, probs = c(0, 0.5, 0.75, 1)),
                                include.lowest = TRUE)) %>%
  ggplot(data = ., aes(x = hhincome, y = rating)) +
  geom_point(mapping = aes(color = review_count_cut)) +
  scale_color_manual(values = c("gray50", "orange", "red"), labels = c("0 - 50th", "50th- 75th", "75th - 100th")) +
  labs(x = "Annual Household Income", y = "Yelp Rating", color = "Review Count (discrete)", title = "Household Income vs. Rating") +
  # Pearson correlation, printed at a fixed position on the plot
  ggpubr::stat_cor(method = "pearson", label.x = 160000, label.y = 1.5) +
  ggdark::dark_theme_gray()
## Warning: Removed 139 rows containing non-finite values (stat_cor).
## Warning: Removed 139 rows containing missing values (geom_point).
What about other neighborhood characteristics, such as the proportion of White residents?
# Proportion of White residents vs. Yelp rating, coloured by review-count class.
census_yelp %>%
  # Three review-count classes: bottom half, 50th-75th and top quarter
  # (`probs` spelled out in full instead of relying on partial argument matching)
  mutate(review_count_cut = cut(review_count,
                                breaks = quantile(review_count, probs = c(0, 0.5, 0.75, 1)),
                                include.lowest = TRUE)) %>%
  # Share of the tract population that is White
  mutate(pct_white = race.white / race.tot) %>%
  ggplot(data = ., aes(x = pct_white, y = rating)) +
  geom_point(mapping = aes(color = review_count_cut)) +
  scale_color_manual(values = c("gray50", "orange", "red"), labels = c("0 - 50th", "50th- 75th", "75th - 100th")) +
  labs(x = "Proportion of White Residents", y = "Yelp Rating", color = "Review Count (discrete)", title = "Proportion of White Residents vs. Rating") +
  # Pearson correlation, printed at a fixed position on the plot
  ggpubr::stat_cor(method = "pearson", label.x = 0.7, label.y = 1.5) +
  ggdark::dark_theme_gray()
## Warning: Removed 135 rows containing non-finite values (stat_cor).
## Warning: Removed 135 rows containing missing values (geom_point).