Introduction

csfmt_rts_data_v1

csfmt_rts_data_v1 is a data format for real-time surveillance (see vignette("csfmt_rts_data_v1", package = "cstidy") for the full specification).

d <- cstidy::generate_test_data()
cstidy::set_csfmt_rts_data_v1(d)

# Looking at the dataset
d[]
#>     granularity_time granularity_geo country_iso3 location_code border     age
#>  1:      isoyearweek          county          nor  county_nor42     NA    <NA>
#>  2:      isoyearweek          county          nor  county_nor32     NA    <NA>
#>  3:      isoyearweek          county          nor  county_nor33     NA    <NA>
#>  4:      isoyearweek          county          nor  county_nor56     NA    <NA>
#>  5:      isoyearweek          county          nor  county_nor34     NA    <NA>
#>  6:      isoyearweek          county          nor  county_nor15     NA    <NA>
#>  7:      isoyearweek          county          nor  county_nor18     NA    <NA>
#>  8:      isoyearweek          county          nor  county_nor03     NA    <NA>
#>  9:      isoyearweek          county          nor  county_nor11     NA    <NA>
#> 10:      isoyearweek          county          nor  county_nor40     NA    <NA>
#> 11:      isoyearweek          county          nor  county_nor55     NA    <NA>
#> 12:      isoyearweek          county          nor  county_nor50     NA    <NA>
#> 13:      isoyearweek          county          nor  county_nor39     NA    <NA>
#> 14:      isoyearweek          county          nor  county_nor46     NA    <NA>
#> 15:      isoyearweek          county          nor  county_nor31     NA    <NA>
#> 16:      isoyearweek          county          nor  county_nor42     NA   total
#> 17:      isoyearweek          county          nor  county_nor32     NA   total
#> 18:      isoyearweek          county          nor  county_nor33     NA   total
#> 19:      isoyearweek          county          nor  county_nor56     NA   total
#> 20:      isoyearweek          county          nor  county_nor34     NA   total
#> 21:      isoyearweek          county          nor  county_nor15     NA   total
#> 22:      isoyearweek          county          nor  county_nor18     NA   total
#> 23:      isoyearweek          county          nor  county_nor03     NA   total
#> 24:      isoyearweek          county          nor  county_nor11     NA   total
#> 25:      isoyearweek          county          nor  county_nor40     NA   total
#> 26:      isoyearweek          county          nor  county_nor55     NA   total
#> 27:      isoyearweek          county          nor  county_nor50     NA   total
#> 28:      isoyearweek          county          nor  county_nor39     NA   total
#> 29:      isoyearweek          county          nor  county_nor46     NA   total
#> 30:      isoyearweek          county          nor  county_nor31     NA   total
#> 31:      isoyearweek          county          nor  county_nor42     NA 000_005
#> 32:      isoyearweek          county          nor  county_nor32     NA 000_005
#> 33:      isoyearweek          county          nor  county_nor33     NA 000_005
#> 34:      isoyearweek          county          nor  county_nor56     NA 000_005
#> 35:      isoyearweek          county          nor  county_nor34     NA 000_005
#> 36:      isoyearweek          county          nor  county_nor15     NA 000_005
#> 37:      isoyearweek          county          nor  county_nor18     NA 000_005
#> 38:      isoyearweek          county          nor  county_nor03     NA 000_005
#> 39:      isoyearweek          county          nor  county_nor11     NA 000_005
#> 40:      isoyearweek          county          nor  county_nor40     NA 000_005
#> 41:      isoyearweek          county          nor  county_nor55     NA 000_005
#> 42:      isoyearweek          county          nor  county_nor50     NA 000_005
#> 43:      isoyearweek          county          nor  county_nor39     NA 000_005
#> 44:      isoyearweek          county          nor  county_nor46     NA 000_005
#> 45:      isoyearweek          county          nor  county_nor31     NA 000_005
#>     granularity_time granularity_geo country_iso3 location_code border     age
#>       sex isoyear isoweek isoyearweek    season seasonweek calyear calmonth
#>  1:  <NA>    2022       3     2022-03 2021/2022         26      NA       NA
#>  2:  <NA>    2022       3     2022-03 2021/2022         26      NA       NA
#>  3:  <NA>    2022       3     2022-03 2021/2022         26      NA       NA
#>  4:  <NA>    2022       3     2022-03 2021/2022         26      NA       NA
#>  5:  <NA>    2022       3     2022-03 2021/2022         26      NA       NA
#>  6:  <NA>    2022       3     2022-03 2021/2022         26      NA       NA
#>  7:  <NA>    2022       3     2022-03 2021/2022         26      NA       NA
#>  8:  <NA>    2022       3     2022-03 2021/2022         26      NA       NA
#>  9:  <NA>    2022       3     2022-03 2021/2022         26      NA       NA
#> 10:  <NA>    2022       3     2022-03 2021/2022         26      NA       NA
#> 11:  <NA>    2022       3     2022-03 2021/2022         26      NA       NA
#> 12:  <NA>    2022       3     2022-03 2021/2022         26      NA       NA
#> 13:  <NA>    2022       3     2022-03 2021/2022         26      NA       NA
#> 14:  <NA>    2022       3     2022-03 2021/2022         26      NA       NA
#> 15:  <NA>    2022       3     2022-03 2021/2022         26      NA       NA
#> 16: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 17: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 18: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 19: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 20: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 21: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 22: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 23: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 24: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 25: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 26: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 27: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 28: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 29: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 30: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 31: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 32: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 33: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 34: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 35: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 36: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 37: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 38: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 39: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 40: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 41: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 42: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 43: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 44: total    2022       3     2022-03 2021/2022         26      NA       NA
#> 45: total    2022       3     2022-03 2021/2022         26      NA       NA
#>       sex isoyear isoweek isoyearweek    season seasonweek calyear calmonth
#>     calyearmonth       date deaths_n
#>  1:         <NA> 2022-01-23        4
#>  2:         <NA> 2022-01-23        4
#>  3:         <NA> 2022-01-23        8
#>  4:         <NA> 2022-01-23        3
#>  5:         <NA> 2022-01-23        4
#>  6:         <NA> 2022-01-23        4
#>  7:         <NA> 2022-01-23        7
#>  8:         <NA> 2022-01-23        3
#>  9:         <NA> 2022-01-23        6
#> 10:         <NA> 2022-01-23       10
#> 11:         <NA> 2022-01-23        5
#> 12:         <NA> 2022-01-23        5
#> 13:         <NA> 2022-01-23        4
#> 14:         <NA> 2022-01-23        4
#> 15:         <NA> 2022-01-23        6
#> 16:         <NA> 2022-01-23        4
#> 17:         <NA> 2022-01-23        4
#> 18:         <NA> 2022-01-23        8
#> 19:         <NA> 2022-01-23        3
#> 20:         <NA> 2022-01-23        4
#> 21:         <NA> 2022-01-23        4
#> 22:         <NA> 2022-01-23        7
#> 23:         <NA> 2022-01-23        3
#> 24:         <NA> 2022-01-23        6
#> 25:         <NA> 2022-01-23       10
#> 26:         <NA> 2022-01-23        5
#> 27:         <NA> 2022-01-23        5
#> 28:         <NA> 2022-01-23        4
#> 29:         <NA> 2022-01-23        4
#> 30:         <NA> 2022-01-23        6
#> 31:         <NA> 2022-01-23        4
#> 32:         <NA> 2022-01-23        4
#> 33:         <NA> 2022-01-23        8
#> 34:         <NA> 2022-01-23        3
#> 35:         <NA> 2022-01-23        4
#> 36:         <NA> 2022-01-23        4
#> 37:         <NA> 2022-01-23        7
#> 38:         <NA> 2022-01-23        3
#> 39:         <NA> 2022-01-23        6
#> 40:         <NA> 2022-01-23       10
#> 41:         <NA> 2022-01-23        5
#> 42:         <NA> 2022-01-23        5
#> 43:         <NA> 2022-01-23        4
#> 44:         <NA> 2022-01-23        4
#> 45:         <NA> 2022-01-23        6
#>     calyearmonth       date deaths_n

Smart assignment

csfmt_rts_data_v1 does smart assignment for time and geography.

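When one of the variables below is assigned using :=, the variables listed after it are automatically imputed (as demonstrated in the examples that follow).

location_code: granularity_geo, country_iso3

isoyear: granularity_time, isoweek, isoyearweek, season, seasonweek, date

isoyearweek: granularity_time, isoyear, isoweek, season, seasonweek, date

date: granularity_time, isoyear, isoweek, isoyearweek, season, seasonweek, calyear, calmonth, calyearmonth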

d <- cstidy::generate_test_data()[1:5]
cstidy::set_csfmt_rts_data_v1(d)

# Looking at the dataset
d[]
#>    granularity_time granularity_geo country_iso3 location_code border  age  sex
#> 1:      isoyearweek          county          nor  county_nor42     NA <NA> <NA>
#> 2:      isoyearweek          county          nor  county_nor32     NA <NA> <NA>
#> 3:      isoyearweek          county          nor  county_nor33     NA <NA> <NA>
#> 4:      isoyearweek          county          nor  county_nor56     NA <NA> <NA>
#> 5:      isoyearweek          county          nor  county_nor34     NA <NA> <NA>
#>    isoyear isoweek isoyearweek    season seasonweek calyear calmonth
#> 1:    2022       3     2022-03 2021/2022         26      NA       NA
#> 2:    2022       3     2022-03 2021/2022         26      NA       NA
#> 3:    2022       3     2022-03 2021/2022         26      NA       NA
#> 4:    2022       3     2022-03 2021/2022         26      NA       NA
#> 5:    2022       3     2022-03 2021/2022         26      NA       NA
#>    calyearmonth       date deaths_n
#> 1:         <NA> 2022-01-23        8
#> 2:         <NA> 2022-01-23        7
#> 3:         <NA> 2022-01-23        6
#> 4:         <NA> 2022-01-23        2
#> 5:         <NA> 2022-01-23        7

# Smart assignment of isoyearweek (note how isoyear, isoweek, season, seasonweek, and date in row 1 are updated to match)
d[1,isoyearweek := "2021-01"]
d
#>    granularity_time granularity_geo country_iso3 location_code border  age  sex
#> 1:      isoyearweek          county          nor  county_nor42     NA <NA> <NA>
#> 2:      isoyearweek          county          nor  county_nor32     NA <NA> <NA>
#> 3:      isoyearweek          county          nor  county_nor33     NA <NA> <NA>
#> 4:      isoyearweek          county          nor  county_nor56     NA <NA> <NA>
#> 5:      isoyearweek          county          nor  county_nor34     NA <NA> <NA>
#>    isoyear isoweek isoyearweek    season seasonweek calyear calmonth
#> 1:    2021       1     2021-01 2020/2021         24      NA       NA
#> 2:    2022       3     2022-03 2021/2022         26      NA       NA
#> 3:    2022       3     2022-03 2021/2022         26      NA       NA
#> 4:    2022       3     2022-03 2021/2022         26      NA       NA
#> 5:    2022       3     2022-03 2021/2022         26      NA       NA
#>    calyearmonth       date deaths_n
#> 1:         <NA> 2021-01-10        8
#> 2:         <NA> 2022-01-23        7
#> 3:         <NA> 2022-01-23        6
#> 4:         <NA> 2022-01-23        2
#> 5:         <NA> 2022-01-23        7

# Smart assignment of isoyear (note how granularity_time, isoweek, isoyearweek, season, seasonweek, and date in row 2 all change)
d[2,isoyear := 2019]
d
#>    granularity_time granularity_geo country_iso3 location_code border  age  sex
#> 1:      isoyearweek          county          nor  county_nor42     NA <NA> <NA>
#> 2:          isoyear          county          nor  county_nor32     NA <NA> <NA>
#> 3:      isoyearweek          county          nor  county_nor33     NA <NA> <NA>
#> 4:      isoyearweek          county          nor  county_nor56     NA <NA> <NA>
#> 5:      isoyearweek          county          nor  county_nor34     NA <NA> <NA>
#>    isoyear isoweek isoyearweek    season seasonweek calyear calmonth
#> 1:    2021       1     2021-01 2020/2021         24      NA       NA
#> 2:    2019      52     2019-52      <NA>         NA      NA       NA
#> 3:    2022       3     2022-03 2021/2022         26      NA       NA
#> 4:    2022       3     2022-03 2021/2022         26      NA       NA
#> 5:    2022       3     2022-03 2021/2022         26      NA       NA
#>    calyearmonth       date deaths_n
#> 1:         <NA> 2021-01-10        8
#> 2:         <NA> 2019-12-29        7
#> 3:         <NA> 2022-01-23        6
#> 4:         <NA> 2022-01-23        2
#> 5:         <NA> 2022-01-23        7

# Smart assignment of date (note how granularity_time and all iso*/season*/cal* time columns in rows 4-5 change)
d[4:5,date := as.Date("2020-01-01")]
d
#>    granularity_time granularity_geo country_iso3 location_code border  age  sex
#> 1:      isoyearweek          county          nor  county_nor42     NA <NA> <NA>
#> 2:          isoyear          county          nor  county_nor32     NA <NA> <NA>
#> 3:      isoyearweek          county          nor  county_nor33     NA <NA> <NA>
#> 4:             date          county          nor  county_nor56     NA <NA> <NA>
#> 5:             date          county          nor  county_nor34     NA <NA> <NA>
#>    isoyear isoweek isoyearweek    season seasonweek calyear calmonth
#> 1:    2021       1     2021-01 2020/2021         24      NA       NA
#> 2:    2019      52     2019-52      <NA>         NA      NA       NA
#> 3:    2022       3     2022-03 2021/2022         26      NA       NA
#> 4:    2020       1     2020-01 2019/2020         24    2020        1
#> 5:    2020       1     2020-01 2019/2020         24    2020        1
#>    calyearmonth       date deaths_n
#> 1:         <NA> 2021-01-10        8
#> 2:         <NA> 2019-12-29        7
#> 3:         <NA> 2022-01-23        6
#> 4:     2020-M01 2020-01-01        2
#> 5:     2020-M01 2020-01-01        7

# Smart assignment is disabled (with a warning) when multiple time columns are set at once; the values are assigned as given
d[1,c("isoyear","isoyearweek") := .(2021,"2021-01")]
#> Warning in `[.csfmt_rts_data_v1`(d, 1, `:=`(c("isoyear", "isoyearweek"), :
#> Multiple time variables specified. Smart-assignment disabled.
d
#>    granularity_time granularity_geo country_iso3 location_code border  age  sex
#> 1:      isoyearweek          county          nor  county_nor42     NA <NA> <NA>
#> 2:          isoyear          county          nor  county_nor32     NA <NA> <NA>
#> 3:      isoyearweek          county          nor  county_nor33     NA <NA> <NA>
#> 4:             date          county          nor  county_nor56     NA <NA> <NA>
#> 5:             date          county          nor  county_nor34     NA <NA> <NA>
#>    isoyear isoweek isoyearweek    season seasonweek calyear calmonth
#> 1:    2021       1     2021-01 2020/2021         24      NA       NA
#> 2:    2019      52     2019-52      <NA>         NA      NA       NA
#> 3:    2022       3     2022-03 2021/2022         26      NA       NA
#> 4:    2020       1     2020-01 2019/2020         24    2020        1
#> 5:    2020       1     2020-01 2019/2020         24    2020        1
#>    calyearmonth       date deaths_n
#> 1:         <NA> 2021-01-10        8
#> 2:         <NA> 2019-12-29        7
#> 3:         <NA> 2022-01-23        6
#> 4:     2020-M01 2020-01-01        2
#> 5:     2020-M01 2020-01-01        7

# Smart assignment of geo columns
d[1,c("location_code") := .("norge")]
d
#>    granularity_time granularity_geo country_iso3 location_code border  age  sex
#> 1:      isoyearweek          nation          nor         norge     NA <NA> <NA>
#> 2:          isoyear          county          nor  county_nor32     NA <NA> <NA>
#> 3:      isoyearweek          county          nor  county_nor33     NA <NA> <NA>
#> 4:             date          county          nor  county_nor56     NA <NA> <NA>
#> 5:             date          county          nor  county_nor34     NA <NA> <NA>
#>    isoyear isoweek isoyearweek    season seasonweek calyear calmonth
#> 1:    2021       1     2021-01 2020/2021         24      NA       NA
#> 2:    2019      52     2019-52      <NA>         NA      NA       NA
#> 3:    2022       3     2022-03 2021/2022         26      NA       NA
#> 4:    2020       1     2020-01 2019/2020         24    2020        1
#> 5:    2020       1     2020-01 2019/2020         24    2020        1
#>    calyearmonth       date deaths_n
#> 1:         <NA> 2021-01-10        8
#> 2:         <NA> 2019-12-29        7
#> 3:         <NA> 2022-01-23        6
#> 4:     2020-M01 2020-01-01        2
#> 5:     2020-M01 2020-01-01        7

# Collapsing down to different levels, and healing the dataset 
# (so that it can be worked on further with regards to real time surveillance)
d[, .(deaths_n = sum(deaths_n), location_code = "norge"), keyby=.(granularity_time)] %>%
  cstidy::set_csfmt_rts_data_v1(create_unified_columns = FALSE) %>%
  print()
#>    granularity_time deaths_n location_code date
#> 1:             date        9         norge <NA>
#> 2:          isoyear        7         norge <NA>
#> 3:      isoyearweek       14         norge <NA>

# Collapsing to different levels, and removing the class csfmt_rts_data_v1 because
# it is going to be used in new output/analyses
d[, .(deaths_n = sum(deaths_n), location_code = "norge"), keyby=.(granularity_time)] %>%
  cstidy::remove_class_csfmt_rts_data() %>%
  print()
#>    granularity_time deaths_n location_code
#> 1:             date        9         norge
#> 2:          isoyear        7         norge
#> 3:      isoyearweek       14         norge

Summary

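summary() gives a quick overview of the data structure of a dataset: it first validates each standard column, then lists the type (and, where shown, the distinct values) of every column.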

cstidy::generate_test_data() %>%
  cstidy::set_csfmt_rts_data_v1() %>%
  summary()
#> 
#> granularity_time
#> ✅ No errors
#> 
#> granularity_geo
#> ✅ No errors
#> 
#> country_iso3
#> ✅ No errors
#> 
#> location_code
#> ✅ No errors
#> 
#> border
#> ❌ Errors:
#> - NA exists (not allowed)
#> 
#> age
#> ✅ No errors
#> 
#> sex
#> ✅ No errors
#> 
#> isoyear
#> ✅ No errors
#> 
#> isoweek
#> ✅ No errors
#> 
#> isoyearweek
#> ✅ No errors
#> 
#> season
#> ✅ No errors
#> 
#> seasonweek
#> ✅ No errors
#> 
#> calyear
#> ✅ No errors
#> 
#> calmonth
#> ✅ No errors
#> 
#> calyearmonth
#> ✅ No errors
#> 
#> date
#> ✅ No errors
#> granularity_time (character):
#>  - isoyearweek (n = 45)
#> granularity_geo (character):
#>  - county (n = 45)
#> country_iso3 (character):
#>  - nor (n = 45)
#> location_code (character)
#> border (integer):
#>  - <NA> (n = 45)
#> age (character):
#>  - 000_005 (n = 15)
#>  - <NA>    (n = 15)
#>  - total   (n = 15)
#> sex (character):
#>  - <NA>  (n = 15)
#>  - total (n = 30)
#> isoyear (integer):
#>  - 2022 (n = 45)
#> isoweek (integer)
#> isoyearweek (character)
#> season (character):
#>  - 2021/2022 (n = 45)
#> seasonweek (numeric)
#> calyear (integer)
#> calmonth (integer)
#> calyearmonth (character)
#> date (Date)
#> deaths_n (integer)

Identifying data structure of one column

identify_data_structure() summarizes the data structure of a single column inside a dataset; the result can be passed to plot().

cstidy::generate_test_data() %>%
  cstidy::set_csfmt_rts_data_v1() %>%
  cstidy::identify_data_structure("deaths_n") %>%
  plot()