Analysis

This is the data analysis page

# loading packages
load("df_clean.RData")
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(plotly)

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout

Q1

What are the top 10 National Sports Associations with the most subvention granted over the past 5 years?

# making a new data of the average subvention received over the past 5 years, and then taking the top 10 out. 
top10 <- df_clean |> 
  group_by(engname) |>
  summarise(avg_subsidy = mean(subsidy2)) |>
  arrange(-avg_subsidy) |>
  top_n(10, avg_subsidy) 
# making the bar chart and text on the bars for it to be easier to read. 

p1 <- ggplot(top10, 
       aes(x = reorder(engname, avg_subsidy), 
           y = avg_subsidy,
           fill = reorder(engname, -avg_subsidy)
           )) +
  
  geom_col(stat = "engname",
           width = 0.8,
           alpha = 0.8) +
  
  geom_text(aes(label = engname,
                y = avg_subsidy*0.5,
                hjust = 0.5),
                color = "black",
                size = 2.3) +
  
  labs(title = "Top 10 NSAs by Average Subvention", 
       x = "Average Subsidy (HKD)",
       y = "National Sports Associations",
       caption = "Source: data.gov.hk | Author Joshua Kwok") +
  
  scale_y_continuous(labels = scales::comma) +
  
  theme_minimal() +
  
  theme(legend.position = "none",
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
         plot.margin = margin(1, 2, 1, 1, "cm")
        ) +
  
  coord_flip()
Warning in geom_col(stat = "engname", width = 0.8, alpha = 0.8): Ignoring
unknown parameters: `stat`
ggplotly(p1)
ggsave("out/top10.jpg"
       , plot = p1, width = 8, height = 6, dpi = 300)

Q2

What is the overall trend of subvention granted to all NSAs?

# making new df for total subvention granted and percentage change of subvention granted

yearly_total <- df_clean |>
  group_by(year) |>
  summarise(total_subsidy_all = sum(subsidy2)) 

yearly_growth <- yearly_total |>
  mutate(
    growth_rate = (total_subsidy_all - lag(total_subsidy_all)) / lag(total_subsidy_all)
    ) 

glimpse(yearly_growth)
Rows: 5
Columns: 3
$ year              <int> 2017, 2018, 2019, 2020, 2021
$ total_subsidy_all <dbl> 282606527, 307603712, 370198803, 375567923, 449403159
$ growth_rate       <dbl> NA, 0.08845226, 0.20349264, 0.01450334, 0.19659623
# making the trend for subsidy granted 
p2 <- ggplot(yearly_growth,
             aes(x = year,
                 y = growth_rate)
             ) +
  
  geom_line(group=1) +
  
  geom_point(aes(color = year)) +
  
    scale_y_continuous(labels = scales::percent_format(accuracy = 1))+
  
  labs(title = "Percentage Change of Subvention Granted to National Sports Associations",
       x = "Year",  
       y = "Change in Subsidy Granted (%)", 
       caption = "Source: data.gov.hk | Author Joshua Kwok") +
  
  theme(legend.position = "none")


ggplotly(p2)
ggsave("out/percentage.jpg", plot = p2, width = 8, height = 6, dpi = 300)
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_line()`).
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).

Q3

What is the distribution of subvention granted to different NSAs?

# graph for the subsidy overview 2017-2021
overview <- df_clean |>
  group_by(
    engname, year
    ) 
  
p3 <- ggplot(overview,
    aes(x = year, 
        y = subsidy2/1000000)) +
  
  geom_jitter(
     aes(color = engname),
     size= 1,
     width = 0.2,
     alpha = 0.8) +
  
  labs(title = "Subvention Granted to All National Sports Associations 2017-2021",
       x = "Year",  
       y = "Average Subsidy Granted (Million HKD)", 
       caption = "Source: data.gov.hk | Author Joshua Kwok") +
  
  theme_minimal()+
  
  theme(legend.position = "none")

ggplotly(p3)
ggsave("out/overview.jpg", 
       plot = p3, width = 8, height = 6, dpi = 300)

Q4

How has winning medals affected the subvention granted to according NSAs.

# defining the olympic sports associations and NSAs that won medals in international level competitions. 

winning_sports <- c(
  "Hong Kong Fencing Association",
  "Hong Kong China Swimming Association",
  "The Cycling Association of Hong Kong, China Limited",
  "The Hong Kong Table Tennis Association Limited",
  "Hong Kong Rugby Union",
  "Hong Kong Squash")

olympic_sports <- c(
  "Hong Kong Archery Association",
  "The Gymnastics Association of Hong Kong, China",
  "Hong Kong Association of Athletics Affiliates Limited",
  "Hong Kong Badminton Association Limited", 
  "Hong Kong Basketball Association Limited",
  "Volleyball Association of Hong Kong, China Limited",
  "Hong Kong Boxing Association Limited",
  "The Hong Kong Canoe Union Limited", 
  "Hong Kong Equestrian Federation",
  "The Hong Kong Football Association Limited",
  "Hong Kong Golf Association Limited",                       
  "Handball Association of Hong Kong, China Limited",
  "The Hong Kong Hockey Association",                       
  "The Judo Association of Hong Kong, China",
  "Hong Kong, China Rowing Association", 
  "Hong Kong Sailing Federation",                             
  "Hong Kong Shooting Association", 
  "China Hong Kong Mountaineering and Climbing Union Limited",
  "Hong Kong Taekwondo Association Limited",                
  "The Hong Kong Tennis Association Limited",                
  "Hong Kong Triathlon Association Limited")
# Creating the graphs for the medal winning NSAs, with facet

p4 <- df_clean |> 
  filter(engname %in% winning_sports) |> 
  ggplot(
    aes(x = year, 
        y = subsidy2/1000000,
        color = engname),
        ) +
  
  labs(x = NULL, 
       y = "Subsidy in Million HKD",
       title = "Winning Sports Subsidies (2017-2021)",
       color = "NSAs",) +
  
  geom_line(size = 0.5,
            alpha = 0.7) +
  
    geom_point(size = 0.5,
            alpha = 0.7) +
  
    theme_bw() +
  
  theme(legend.position = "none",
        strip.text = element_text(size = 6.5),  
    panel.spacing = unit(0.5, "lines")) +
  
  facet_wrap(~engname)
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
ggplotly(p4)
ggsave("out/winning_sports.jpg", width = 8, height = 6, units = "in")

Q5

Has the subvention granted to NSAs been affected by appearance in Olympics?

p5 <- df_clean |> 
  filter(engname %in% olympic_sports) |> 
  ggplot(
    aes(x = year, 
        y = subsidy2/1000000,
        color = engname)) +
  
  labs(x = NULL, 
       y = NULL,
       title = "Olympic Sports",
       color = "NSAs") +
  
  geom_line(size = 0.5,
            alpha = 0.7) +
  
    geom_point(size = 0.5,
            alpha = 0.7,) +
  
    theme_bw() +
  
  theme(legend.position = "none",
        strip.text = element_text(size = 4.5),  
        panel.spacing = unit(0.3, "lines"),
        axis.text.x = element_text(size = 4),
        axis.text.y = element_text(size = 4)) +
  
  facet_wrap(~engname)


ggplotly(p5)
ggsave("out/olympic_sports.jpg", width = 8, height = 6, units = "in")
 save(p1, p2, p3, p4, p5, file = "plots.RData")