Cleaning

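This is the data cleaning page. It imports the raw sports subvention data, renames and converts the columns needed for analysis, and saves the cleaned result.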

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)

Data Import

# Read the raw CSV into a tibble; read_csv() guesses the column types.
df <- read_csv(file = "data/sports_sss.csv")
Rows: 310 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): Year, NSA_name_en, NSA_name_tc, NSA_name_sc, Subvention_granted

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of the raw data.
df |> head()
# A tibble: 6 × 5
  Year    NSA_name_en                 NSA_name_tc NSA_name_sc Subvention_granted
  <chr>   <chr>                       <chr>       <chr>       <chr>             
1 2021-22 Hong Kong Archery Associat… 香港射箭總會…… 香港射箭总会…… 6,053,025         
2 2021-22 Hong Kong Association of A… 香港田徑總會有限公司… 香港田径总会有限公司… 8,507,101         
3 2021-22 Hong Kong Badminton Associ… 香港羽毛球總會有限公… 香港羽毛球总会有限公… 20,513,305        
4 2021-22 The Hong Kong Baseball Ass… 香港棒球總會有限公司… 香港棒球总会有限公司… 8,950,382         
5 2021-22 Hong Kong Basketball Assoc… 香港籃球總會有限公司… 香港篮球总会有限公司… 14,105,445        
6 2021-22 Hong Kong Billiard Sports … 香港桌球總會有限公司… 香港桌球总会有限公司… 5,625,452         
# Inspect the column types and a sample of values in each column.
df |> str()
spc_tbl_ [310 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ Year              : chr [1:310] "2021-22" "2021-22" "2021-22" "2021-22" ...
 $ NSA_name_en       : chr [1:310] "Hong Kong Archery Association" "Hong Kong Association of Athletics Affiliates Limited" "Hong Kong Badminton Association Limited" "The Hong Kong Baseball Association Limited" ...
 $ NSA_name_tc       : chr [1:310] "香港射箭總會" "香港田徑總會有限公司" "香港羽毛球總會有限公司" "香港棒球總會有限公司" ...
 $ NSA_name_sc       : chr [1:310] "香港射箭总会" "香港田径总会有限公司" "香港羽毛球总会有限公司" "香港棒球总会有限公司" ...
 $ Subvention_granted: chr [1:310] "6,053,025" "8,507,101" "20,513,305" "8,950,382" ...
 - attr(*, "spec")=
  .. cols(
  ..   Year = col_character(),
  ..   NSA_name_en = col_character(),
  ..   NSA_name_tc = col_character(),
  ..   NSA_name_sc = col_character(),
  ..   Subvention_granted = col_character()
  .. )
 - attr(*, "problems")=<externalptr> 

Data Cleaning

# Give the columns used later shorter names (new_name = old_name).
# NSA_name_sc is not renamed here.
df1 <- rename(
  df,
  engname = NSA_name_en,
  chiname = NSA_name_tc,
  subsidy = Subvention_granted
)
df1 |> head()
# A tibble: 6 × 5
  Year    engname                                    chiname NSA_name_sc subsidy
  <chr>   <chr>                                      <chr>   <chr>       <chr>  
1 2021-22 Hong Kong Archery Association              香港射箭總會… 香港射箭总会…… 6,053,…
2 2021-22 Hong Kong Association of Athletics Affili… 香港田徑總會… 香港田径总会有限公司… 8,507,…
3 2021-22 Hong Kong Badminton Association Limited    香港羽毛球總… 香港羽毛球总会有限公… 20,513…
4 2021-22 The Hong Kong Baseball Association Limited 香港棒球總會… 香港棒球总会有限公司… 8,950,…
5 2021-22 Hong Kong Basketball Association Limited   香港籃球總會… 香港篮球总会有限公司… 14,105…
6 2021-22 Hong Kong Billiard Sports Control Council… 香港桌球總會… 香港桌球总会有限公司… 5,625,…
# Derive typed columns from the raw character fields:
#   year     - first four characters of Year ("2021-22" -> 2021), as integer
#   subsidy2 - subsidy with the comma separators removed, as numeric
# Entries that are still not numeric once the commas are removed become NA
# (as.numeric() emits a coercion warning); those rows are then dropped.
df2 <- df1 |>
  mutate(
    year = as.integer(substr(Year, 1, 4)),
    subsidy2 = as.numeric(gsub(",", "", subsidy))
  ) |>
  drop_na(subsidy2)
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `subsidy2 = as.numeric(gsub(",", "", subsidy))`.
Caused by warning:
! NAs introduced by coercion
# Preview the converted data, including the new year and subsidy2 columns.
df2 |> head()
# A tibble: 6 × 7
  Year    engname                     chiname NSA_name_sc subsidy  year subsidy2
  <chr>   <chr>                       <chr>   <chr>       <chr>   <int>    <dbl>
1 2021-22 Hong Kong Archery Associat… 香港射箭總會… 香港射箭总会…… 6,053,…  2021  6053025
2 2021-22 Hong Kong Association of A… 香港田徑總會… 香港田径总会有限公司… 8,507,…  2021  8507101
3 2021-22 Hong Kong Badminton Associ… 香港羽毛球總… 香港羽毛球总会有限公… 20,513…  2021 20513305
4 2021-22 The Hong Kong Baseball Ass… 香港棒球總會… 香港棒球总会有限公司… 8,950,…  2021  8950382
5 2021-22 Hong Kong Basketball Assoc… 香港籃球總會… 香港篮球总会有限公司… 14,105…  2021 14105445
6 2021-22 Hong Kong Billiard Sports … 香港桌球總會… 香港桌球总会有限公司… 5,625,…  2021  5625452
# Keep only the columns needed for analysis, in this order.
df_clean <- select(df2, year, engname, chiname, subsidy2)

df_clean |> head()
# A tibble: 6 × 4
   year engname                                                 chiname subsidy2
  <int> <chr>                                                   <chr>      <dbl>
1  2021 Hong Kong Archery Association                           香港射箭總會…  6053025
2  2021 Hong Kong Association of Athletics Affiliates Limited   香港田徑總會…  8507101
3  2021 Hong Kong Badminton Association Limited                 香港羽毛球總… 20513305
4  2021 The Hong Kong Baseball Association Limited              香港棒球總會…  8950382
5  2021 Hong Kong Basketball Association Limited                香港籃球總會… 14105445
6  2021 Hong Kong Billiard Sports Control Council Company Limi… 香港桌球總會…  5625452
# Write the cleaned tibble to an .RData file; load() will restore it under
# the name df_clean.
save(list = "df_clean", file = "df_clean.RData")