Data wrangling and visualisation

library(bagyo)
library(dplyr)
library(tidyr)
library(ggplot2)
library(lubridate)

Data wrangling

The following examples show how the bagyo dataset can be used to demonstrate various data wrangling approaches, particularly those that use the tidyverse packages.

Creating summaries

## Get number of cyclones in each category per year ----
## `count()` only returns the year-category pairs that occur in the data.
## `complete()` adds the pairs that are absent, and `fill` records them as a
## count of zero (a category with no cyclones has 0 of them, not a missing
## value).
bagyo |>
  count(year, category_name) |>
  complete(year, category_name, fill = list(n = 0L))
#> # A tibble: 20 × 3
#>     year category_name             n
#>    <dbl> <fct>                 <int>
#>  1  2017 Tropical Depression       5
#>  2  2017 Tropical Storm            9
#>  3  2017 Severe Tropical Storm     5
#>  4  2017 Typhoon                   3
#>  5  2017 Super Typhoon             0
#>  6  2018 Tropical Depression       4
#>  7  2018 Tropical Storm            7
#>  8  2018 Severe Tropical Storm     4
#>  9  2018 Typhoon                   6
#> 10  2018 Super Typhoon             0
#> 11  2019 Tropical Depression       8
#> 12  2019 Tropical Storm            2
#> 13  2019 Severe Tropical Storm     3
#> 14  2019 Typhoon                   8
#> 15  2019 Super Typhoon             0
#> 16  2020 Tropical Depression       6
#> 17  2020 Tropical Storm            7
#> 18  2020 Severe Tropical Storm     3
#> 19  2020 Typhoon                   4
#> 20  2020 Super Typhoon             2
## Yearly averages of cyclone pressure and speed ----
## `across()` applies `mean()` to both measurement columns; `.names` prefixes
## each result column with "mean_".
bagyo |>
  group_by(year) |>
  summarise(
    across(c(pressure, speed), mean, .names = "mean_{.col}")
  )
#> # A tibble: 4 × 3
#>    year mean_pressure mean_speed
#>   <dbl>         <dbl>      <dbl>
#> 1  2017          986.       88.0
#> 2  2018          961.       66.7
#> 3  2019          976.       59.0
#> 4  2020          973.       62.0
## Per-category cyclone count with average pressure and speed ----
## `n()` counts the rows in each category; `across()` then averages the two
## measurement columns and names the results "mean_pressure" and "mean_speed".
bagyo |>
  group_by(category_name) |>
  summarise(
    n = n(),
    across(c(pressure, speed), mean, .names = "mean_{.col}")
  )
#> # A tibble: 5 × 4
#>   category_name             n mean_pressure mean_speed
#>   <fct>                 <int>         <dbl>      <dbl>
#> 1 Tropical Depression      23          996.       39.8
#> 2 Tropical Storm           25          986.       61.6
#> 3 Severe Tropical Storm    15          978.       75  
#> 4 Typhoon                  21          941.      102. 
#> 5 Super Typhoon             2          908.      112.

Working with date and time data

## Get cyclone category mean duration (in hours) ----
## Hours are requested explicitly: plain `end - start` lets R choose the
## difftime unit from the data, so the result would not be guaranteed to be in
## hours.
bagyo |>
  mutate(duration = difftime(end, start, units = "hours")) |>
  group_by(category_name) |>
  summarise(mean_duration = mean(duration))
#> # A tibble: 5 × 2
#>   category_name         mean_duration  
#>   <fct>                 <drtn>         
#> 1 Tropical Depression    46.69565 hours
#> 2 Tropical Storm         57.48000 hours
#> 3 Severe Tropical Storm  79.13333 hours
#> 4 Typhoon               106.66667 hours
#> 5 Super Typhoon          77.50000 hours
## Monthly cyclone counts for each year ----
## The month label is taken from each cyclone's start date. Month-year pairs
## with no cyclone are added by `complete()` with a count of 0, and the result
## is ordered chronologically.
bagyo |>
  mutate(month = month(start, label = TRUE)) |>
  count(month, year) |>
  complete(month, year, fill = list(n = 0)) |>
  arrange(year, month)
#> # A tibble: 48 × 3
#>    month  year     n
#>    <ord> <dbl> <int>
#>  1 Jan    2017     1
#>  2 Feb    2017     1
#>  3 Mar    2017     0
#>  4 Apr    2017     2
#>  5 May    2017     0
#>  6 Jun    2017     0
#>  7 Jul    2017     4
#>  8 Aug    2017     2
#>  9 Sep    2017     4
#> 10 Oct    2017     3
#> # ℹ 38 more rows

Data visualisation

The following examples show how the bagyo dataset can be used to demonstrate various data visualisation approaches, particularly those that use the tidyverse and ggplot2 packages.

Bar plots

## Plot cyclone category mean duration (in hours) ----
## Duration is computed explicitly in hours and converted to a plain number so
## that the x-axis always matches its "hours" label, whatever unit
## `end - start` would have picked for the data.
bagyo |>
  mutate(duration = as.numeric(difftime(end, start, units = "hours"))) |>
  group_by(category_name) |>
  summarise(mean_duration = mean(duration)) |>
  ggplot(mapping = aes(x = mean_duration, y = category_name)) +
  geom_col(colour = "#4b876e", fill = "#4b876e", alpha = 0.5) +
  labs(
    title = "Mean duration of cyclones",
    subtitle = "By cyclone categories",
    x = "mean duration (hours)",
    y = NULL
  ) +
  theme_minimal() +
  # Keep only the major vertical grid lines, which help read the bar lengths.
  theme(
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank()
  )

Scatter plots

## Cyclone speed by pressure, one panel per year ----
## Points are coloured by cyclone category; `year` is converted to a factor
## before it is used for facetting.
bagyo |>
  mutate(year = factor(year)) |>
  ggplot(mapping = aes(x = speed, y = pressure)) +
  geom_point(aes(colour = category_name), size = 3, alpha = 0.5) +
  facet_wrap(vars(year), ncol = 4) +
  # One colour per cyclone category; the legend title is suppressed.
  scale_colour_manual(
    values = c("#9c5e60", "#4b876e", "#465b92", "#e5be72", "#5d0505"),
    name = NULL
  ) +
  labs(
    x = "wind speed (km/h)",
    y = "central pressure (hPa)",
    title = "Cyclone maximum sustained wind speed and maximum central pressure",
    subtitle = "By cyclone categories and year"
  ) +
  theme_bw() +
  theme(
    panel.grid.minor = element_blank(),
    panel.border = element_rect(colour = "#465b92"),
    strip.background = element_rect(
      fill = alpha("#465b92", 0.7), colour = "#465b92"
    ),
    legend.position = "top"
  )

## Cyclone duration against wind speed, with a linear trend per year ----
bagyo |>
  mutate(
    year = factor(year),
    # Ask for hours explicitly: `end - start` picks its unit from the data,
    # which could silently disagree with the "duration (hours)" axis label.
    duration = as.numeric(difftime(end, start, units = "hours"))
  ) |>
  ggplot(mapping = aes(x = speed, y = duration)) +
  geom_point(
    mapping = aes(colour = year, shape = year), size = 3, alpha = 0.5
  ) +
  # `formula` is spelled out (it is the default for method = "lm") so that the
  # fitted model is explicit.
  geom_smooth(
    mapping = aes(colour = year), method = "lm", formula = y ~ x,
    se = FALSE, linewidth = 0.75
  ) +
  scale_colour_manual(
    values = c("#9c5e60", "#4b876e", "#465b92", "#e5be72")
  ) +
  scale_shape_manual(values = 15:18) +
  # Colour and shape share the same title so the two legends are merged.
  labs(
    title = "Maximum sustained wind speed by duration of cyclones",
    subtitle = "2017-2020",
    x = "speed (km/h)", y = "duration (hours)",
    colour = "Year", shape = "Year"
  ) +
  theme_minimal() +
  theme(legend.position = "top")

Time series

## Monthly cyclone counts, plotted as one panel per year ----
## Counts are tallied per month and year; month-year pairs with no cyclone are
## added with a count of 0 so that every panel shows all twelve months.
bagyo |>
  mutate(month = month(start, label = TRUE)) |>
  count(month, year) |>
  complete(month, year, fill = list(n = 0)) |>
  arrange(year, month) |>
  ggplot(mapping = aes(x = month, y = n)) +
  geom_col(colour = "#4b876e", fill = "#4b876e", alpha = 0.5) +
  facet_wrap(vars(year), ncol = 4) +
  # Whole-number breaks only, since the y-axis is a count.
  scale_y_continuous(breaks = seq(from = 0, to = 6, by = 1)) +
  labs(
    x = NULL,
    y = "n",
    title = "Number of cyclones over time",
    subtitle = "2017-2020"
  ) +
  theme_bw() +
  theme(
    # Month labels are rotated to fit the narrow panels.
    axis.text.x = element_text(size = 10, angle = 90, hjust = 1, vjust = 0.5),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.border = element_rect(colour = "#465b92"),
    strip.background = element_rect(
      fill = alpha("#465b92", 0.7), colour = "#465b92"
    )
  )

Distribution plots

## Box plot of cyclone wind speed by year ----
## `year` is converted to a factor so that each year gets its own box.
bagyo |>
  mutate(year = factor(year)) |>
  ggplot(mapping = aes(x = year, y = speed)) +
  geom_boxplot(colour = "#4b876e", fill = "#4b876e", alpha = 0.5) +
  labs(
    title = "Distribution of tropical cyclone maximum sustained wind speed",
    subtitle = "2017-2020",
    x = NULL, y = "speed (km/h)"
  ) +
  theme_minimal() +
  theme(panel.grid.major.x = element_blank())

## Box plot of cyclone wind speed by year, with individual cyclones ----
## `year` is converted to a factor so that each year gets its own box.
bagyo |>
  mutate(year = factor(year)) |>
  ggplot(mapping = aes(x = year, y = speed)) +
  geom_boxplot(colour = "#4b876e") +
  # Overlay one horizontally jittered point per cyclone on top of the boxes.
  geom_jitter(
    colour = "#4b876e", fill = "#4b876e", alpha = 0.5,
    shape = 21, size = 2, width = 0.2
  ) +
  labs(
    title = "Distribution of tropical cyclone maximum sustained wind speed",
    subtitle = "2017-2020",
    x = NULL, y = "speed (km/h)"
  ) +
  theme_minimal() +
  theme(panel.grid.major.x = element_blank())

## Violin plot of cyclone wind speed by year, with individual cyclones ----
## `year` is converted to a factor so that each year gets its own violin.
bagyo |>
  mutate(year = factor(year)) |>
  ggplot(mapping = aes(x = year, y = speed)) +
  geom_violin(colour = "#4b876e", fill = "#4b876e", alpha = 0.5) +
  # Overlay one horizontally jittered point per cyclone on top of the violins.
  geom_jitter(colour = "#4b876e", size = 3, width = 0.2) +
  labs(
    title = "Distribution of tropical cyclone maximum sustained wind speed",
    subtitle = "2017-2020",
    x = NULL, y = "speed (km/h)"
  ) +
  theme_minimal() +
  theme(panel.grid.major.x = element_blank())