# Setup: load the package code and choose the folder used for the generated
# files.
# Commented-out alternative that attaches the package by name instead:
#library(LongitudinalDummyDataGenerator)
# Load the package found at the current working directory.
devtools::load_all(".")
# Relative folder that is handed to the generator and later read back from.
output_folder <- "../output_dummy_generator/"

Generate service sector dummy data

# Generate the dummy data files and time the run with tictoc.
# The two version tags are reused further down to rebuild the file names.
service_sector_data_version <- "R11v1"
person_level_data_version <- "R11v1"
n_patients_minimum <- 5000

# Leave two cores free, but never ask for fewer than one worker:
# parallel::detectCores() may return NA, and on machines with one or two
# cores the plain subtraction would give -1 or 0.
n_threads <- max(1, parallel::detectCores() - 2, na.rm = TRUE)

tictoc::tic()
generate_all_dummy_data_to_files(
  output_folder = output_folder,
  service_sector_data_version = service_sector_data_version,
  person_level_data_version = person_level_data_version,
  n_patients_minimum = n_patients_minimum,
  # Fixed seed; presumably makes the generated data reproducible -- confirm
  # against the package documentation.
  seed = 13,
  # Argument name kept exactly as spelled in the original call.
  nTreaths = n_threads
)
tictoc::toc()

Load data


# Read the two generated tab-separated files back from `output_folder`.

# Service sector table: every column is read as character.
file_name <- stringr::str_c(
  "dummy_service_sector_",
  service_sector_data_version,
  ".txt"
)
service_sector_data <- file.path(output_folder, file_name) |>
  readr::read_tsv(
    col_types = readr::cols(.default = readr::col_character())
  )

# Minimum extended table: column types are left to readr's guessing, which
# is what produces the column specification message.
file_name <- stringr::str_c(
  "dummy_minimum_extended_",
  person_level_data_version,
  ".txt"
)
minimum_extended <- file.path(output_folder, file_name) |>
  readr::read_tsv()
#> Rows: 5000 Columns: 22
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: "\t"
#> chr   (7): FINNGENID, SEX, SMOKE2, SMOKE3, SMOKE5, regionofbirthname, COHORT
#> dbl  (11): BL_YEAR, BL_AGE, HEIGHT, WEIGHT, regionofbirth, movedabroad, NUMB...
#> lgl   (3): HEIGHT_AGE, WEIGHT_AGE, SMOKE_AGE
#> date  (1): APPROX_BIRTH_DAY
#> 
#>  Use `spec()` to retrieve the full column specification for this data.
#>  Specify the column types or set `show_col_types = FALSE` to quiet this message.

Assess data

Patients overlap

library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#>  dplyr     1.1.2      readr     2.1.4
#>  forcats   1.0.0      stringr   1.5.0
#>  ggplot2   3.4.2      tibble    3.2.1
#>  lubridate 1.9.2      tidyr     1.3.0
#>  purrr     1.0.1     
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#>  dplyr::filter() masks stats::filter()
#>  dplyr::lag()    masks stats::lag()
#>  Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Bar chart of how many patients fall into each combination of the two
# tables, drawn on a ggupset x axis.

# Distinct patient ids of each table, labelled with the table they come from.
service_sector_patients <- service_sector_data |>
  distinct(FINNGENID) |>
  mutate(table = "service_sector")
minimum_extended_patients <- minimum_extended |>
  distinct(FINNGENID) |>
  mutate(table = "minimum_extended")

# One row per patient, holding the list of tables that patient occurs in.
tables_per_patient <-
  bind_rows(service_sector_patients, minimum_extended_patients) |>
  group_by(FINNGENID) |>
  summarise(Pathways = list(table))

ggplot(tables_per_patient, aes(x = Pathways)) +
  geom_bar() +
  ggupset::scale_x_upset()

Vocabularies over time

# Tag every service-sector row with a vocabulary label and with the calendar
# year of the event. Rows whose SOURCE matches none of the rules below get a
# missing vocabulary.
service_sector_data_vocab <- service_sector_data |>
  mutate(
    vocabulary = dplyr::case_when(
      # These sources take the vocabulary straight from ICDVER.
      SOURCE %in% c("INPAT", "OUTPAT", "REIMB", "DEATH") ~ ICDVER,
      # These sources use the leading run of upper-case letters of CATEGORY.
      SOURCE %in% c("PRIM_OUT", "OPER_IN", "OPER_OUT") ~
        stringr::str_extract(CATEGORY, "^[:upper:]+"),
      SOURCE == "PURCH" ~ "purch",
      SOURCE == "CANC" ~ "canc"
    ),
    event_year = lubridate::year(APPROX_EVENT_DAY)
  )

# Yearly event counts per SOURCE and vocabulary.
events_per_year <- count(
  service_sector_data_vocab,
  SOURCE, vocabulary, event_year,
  name = "n_events"
)

# One facet row per SOURCE, each with its own y scale; bars stacked by
# vocabulary.
ggplot(events_per_year, aes(x = event_year, y = n_events, fill = vocabulary)) +
  geom_col() +
  facet_grid(SOURCE ~ ., scales = "free_y") +
  theme(legend.position = "bottom")

Visit type code class over time

# Keep the first row of every distinct SOURCE/INDEX pair (treated here as one
# visit), retaining all other columns.
service_sector_visits <- service_sector_data |>
  distinct(SOURCE, INDEX, .keep_all = TRUE)

# Classify each visit by which CODE columns are filled in, then plot yearly
# visit counts per SOURCE, stacked by that class.
service_sector_visits |>
  mutate(
    VisitType_class = dplyr::case_when(
      # CODE5 only.
      SOURCE %in% c("INPAT", "OUTPAT", "OPER_IN", "OPER_OUT") &
        !is.na(CODE5) & is.na(CODE8) & is.na(CODE9) ~
        "SOURCE | ServiceSector",
      # CODE5 together with CODE8 and/or CODE9.
      SOURCE %in% c("INPAT", "OUTPAT", "OPER_IN", "OPER_OUT") &
        !is.na(CODE5) & (!is.na(CODE8) | !is.na(CODE9)) ~
        "SOURCE | ServiceSector AND SOURCE | Contact Type | Urgency",
      # CODE8 and/or CODE9 without CODE5.
      SOURCE %in% c("INPAT", "OUTPAT", "OPER_IN", "OPER_OUT") &
        is.na(CODE5) & (!is.na(CODE8) | !is.na(CODE9)) ~
        "SOURCE | Contact Type | Urgency",
      # PRIM_OUT rows with CODE5 and/or CODE6.
      SOURCE %in% c("PRIM_OUT") & (!is.na(CODE5) | !is.na(CODE6)) ~
        "SOURCE | Contact Type | Service Type",
      # Everything else falls back to the bare SOURCE class.
      TRUE ~ "SOURCE"
    ),
    event_year = lubridate::year(APPROX_EVENT_DAY)
  ) |>
  # The count column name is shown as the y-axis title, so it is spelled
  # `n_visits` (the original `n_visists` was a typo).
  count(SOURCE, VisitType_class, event_year, name = "n_visits") |>
  ggplot(aes(x = event_year, y = n_visits, fill = VisitType_class)) +
  geom_col() +
  facet_grid(SOURCE ~ ., scales = "free_y") +
  theme(legend.position = "bottom", legend.direction = "vertical")

Provider code vocabulary over time


# Label each visit with a provider vocabulary, then plot yearly visit counts
# per SOURCE, stacked by that label.
service_sector_visits |>
  mutate(
    Provider_vocabulary = dplyr::case_when(
      SOURCE %in% c("INPAT", "OUTPAT", "OPER_IN", "OPER_OUT") &
        !is.na(CODE6) ~ "MEDSPECfi",
      SOURCE %in% c("PRIM_OUT") & !is.na(CODE7) ~ "ProfessionalCode",
      # Note: this is the literal string "NA", not a missing value.
      TRUE ~ "NA"
    ),
    event_year = lubridate::year(APPROX_EVENT_DAY)
  ) |>
  # The count column name is shown as the y-axis title, so it is spelled
  # `n_visits` (the original `n_visists` was a typo).
  count(SOURCE, Provider_vocabulary, event_year, name = "n_visits") |>
  ggplot(aes(x = event_year, y = n_visits, fill = Provider_vocabulary)) +
  geom_col() +
  facet_grid(SOURCE ~ ., scales = "free_y") +
  theme(legend.position = "bottom")