Use DrugUtilisation to create cohort

Create mock data first

library(DrugUtilisation)
library(CodelistGenerator)
library(CDMConnector)
library(dplyr)
# In-memory DuckDB connection used to hold the mock data.
con <- DBI::dbConnect(duckdb::duckdb(), ":memory:")

# Connection settings handed to the mock generator: tables are written to
# the "main" schema and both prefixes are left as NULL.
connectionDetails <- list(
  con = con, writeSchema = "main", cdmPrefix = NULL, writePrefix = NULL
)

# Mock cdm reference built with numberIndividual = 100.
cdm <- mockDrugUtilisation(
  numberIndividual = 100,
  connectionDetails = connectionDetails
)

Get concept code for cohort generation

To generate a cohort, we need a concept list, which can be obtained in several ways:

Get from json file.
Use concept code directly.
Get ingredient
Get ATC code

To get it from a JSON file, either readConceptList() or codesFromConceptSet() can be used.

# Get the concept sets from the JSON files under inst/Concept, using
# readConceptList() from this package or codesFromConceptSet() from
# CodelistGenerator.
conceptSet_json_1 <- readConceptList(
  here::here("inst", "Concept"),
  cdm
)
conceptSet_json_2 <- codesFromConceptSet(
  here::here("inst", "Concept"),
  cdm
)

# Print both lists.
conceptSet_json_1
#> $asthma
#> [1] 317009
conceptSet_json_2
#> $asthma
#> [1] 317009

Or a list can be created manually with the target codes:

# Build the concept list by hand from the target code: a named list with
# one element per concept set.
conceptSet_code <- list("asthma" = 317009)
conceptSet_code
#> $asthma
#> [1] 317009

If there is a certain ingredient of interest, code can also be obtained by getDrugIngredientCodes() from CodelistGenerator.

# Get the concept codes for the ingredient simvastatin.
conceptSet_ingredient <- getDrugIngredientCodes(
  cdm = cdm,
  name = "simvastatin"
)
conceptSet_ingredient
#> $simvastatin
#> [1] 1539403 1539462 1539463

ATC code can also be obtained, using getATCCodes() from CodelistGenerator.

# Get the concept codes of an ATC group, selected by level and name.
conceptSet_ATC <- getATCCodes(
  cdm = cdm,
  name = "ALIMENTARY TRACT AND METABOLISM",
  level = "ATC 1st"
)
conceptSet_ATC
#> $alimentary_tract_and_metabolism
#> [1] 35897399

Create cohort

Now that we have the concept sets, we can proceed to generate a cohort. There are two functions in this package to generate a cohort:

generateConceptCohortSet: to generate a cohort for a certain list of concepts, which does not have to be drug. This function is exported from CDMConnector
generateDrugUtilisationCohortSet: to generate a cohort of the drug use

generateConceptCohortSet()

First, let’s use generateConceptCohortSet to get the asthma cohort using conceptSet_code. The output would be the same with conceptSet_json_1 or conceptSet_json_2, as they contain the same concept code.

# Asthma cohort ending at the observation period end date, with a required
# observation of c(10, 10) days around the index date.
cdm <- generateConceptCohortSet(
  cdm = cdm,
  name = "asthma_1",
  conceptSet = conceptSet_code,
  requiredObservation = c(10, 10),
  end = "observation_period_end_date",
  overwrite = TRUE
)
cdm$asthma_1
#> # Source:   table<asthma_1> [?? x 4]
#> # Database: DuckDB v0.9.2 [unknown@Linux 6.2.0-1019-azure:R 4.3.2/:memory:]
#>    cohort_definition_id subject_id cohort_start_date cohort_end_date
#>                   <int>      <int> <date>            <date>         
#>  1                    1         48 1973-11-26        2006-01-10     
#>  2                    1         77 1967-10-10        1967-12-11     
#>  3                    1         94 2002-12-23        2004-09-30     
#>  4                    1         14 1966-03-30        1990-10-18     
#>  5                    1         99 2020-01-31        2022-02-26     
#>  6                    1         54 2016-04-09        2021-02-24     
#>  7                    1         96 2016-05-19        2018-11-10     
#>  8                    1          8 2020-04-22        2021-11-01     
#>  9                    1         57 2021-02-19        2021-03-10     
#> 10                    1         58 2005-03-29        2012-07-14     
#> # ℹ more rows

The count of the cohort can be assessed using cohortCount() from CDMConnector

# Record and subject counts of the asthma_1 cohort.
cdm$asthma_1 %>%
  cohortCount()
#> # A tibble: 1 × 3
#>   cohort_definition_id number_records number_subjects
#>                  <int>          <int>           <int>
#> 1                    1             45              45

Cohort attrition can be assessed using cohortAttrition() from CDMConnector

# Attrition table of the asthma_1 cohort.
cdm$asthma_1 %>%
  cohortAttrition()
#> # A tibble: 1 × 7
#>   cohort_definition_id number_records number_subjects reason_id reason          
#>                  <int>          <int>           <int>     <int> <chr>           
#> 1                    1             45              45         1 Initial qualify…
#> # ℹ 2 more variables: excluded_records <int>, excluded_subjects <int>

The end parameter sets how the cohort end date is defined. Here it is changed to the event end date to show the difference from the observation period end date used previously. Note that the cohort_end_date is now different:

# Same asthma cohort, but now ending at the event end date.
cdm <- generateConceptCohortSet(
  cdm = cdm,
  name = "asthma_2",
  conceptSet = conceptSet_code,
  requiredObservation = c(10, 10),
  end = "event_end_date",
  overwrite = TRUE
)
cdm$asthma_2
#> # Source:   table<asthma_2> [?? x 4]
#> # Database: DuckDB v0.9.2 [unknown@Linux 6.2.0-1019-azure:R 4.3.2/:memory:]
#>    cohort_definition_id subject_id cohort_start_date cohort_end_date
#>                   <int>      <int> <date>            <date>         
#>  1                    1         92 2012-12-12        2013-01-21     
#>  2                    1         97 1990-06-26        1990-10-19     
#>  3                    1         58 2005-03-29        2007-08-15     
#>  4                    1         10 2021-08-16        2021-12-14     
#>  5                    1         18 2001-01-11        2004-06-05     
#>  6                    1         34 2008-12-23        2011-12-17     
#>  7                    1         49 2017-10-22        2018-03-31     
#>  8                    1         93 2015-05-31        2016-04-07     
#>  9                    1          1 1996-12-22        1998-03-13     
#> 10                    1         12 2019-03-06        2021-12-09     
#> # ℹ more rows

The requiredObservation parameter is a numeric vector of length 2 that defines the number of days of observation time required before and after the index date for an event to be included in the cohort. Let’s now see how reducing the required observation affects the asthma_1 cohort.

# Asthma cohort ending at the observation period end date, with the
# required observation reduced to c(1, 1) days.
cdm <- generateConceptCohortSet(
  cdm = cdm,
  name = "asthma_3",
  conceptSet = conceptSet_code,
  requiredObservation = c(1, 1),
  end = "observation_period_end_date",
  overwrite = TRUE
)
cdm$asthma_3
#> # Source:   table<asthma_3> [?? x 4]
#> # Database: DuckDB v0.9.2 [unknown@Linux 6.2.0-1019-azure:R 4.3.2/:memory:]
#>    cohort_definition_id subject_id cohort_start_date cohort_end_date
#>                   <int>      <int> <date>            <date>         
#>  1                    1         87 2019-06-24        2022-11-05     
#>  2                    1         95 2016-04-09        2022-05-13     
#>  3                    1         41 2005-07-25        2006-11-18     
#>  4                    1         73 2009-10-02        2011-08-12     
#>  5                    1         20 2001-09-16        2014-10-05     
#>  6                    1         68 2021-08-21        2022-11-01     
#>  7                    1          5 1978-07-01        1995-01-31     
#>  8                    1         26 2018-06-25        2020-10-21     
#>  9                    1         66 2017-11-24        2019-11-09     
#> 10                    1         24 2022-06-06        2022-07-04     
#> # ℹ more rows

# Record and subject counts of the asthma_3 cohort.
cdm$asthma_3 %>%
  cohortCount()
#> # A tibble: 1 × 3
#>   cohort_definition_id number_records number_subjects
#>                  <int>          <int>           <int>
#> 1                    1             48              48

# Attrition table of the asthma_3 cohort.
cdm$asthma_3 %>%
  cohortAttrition()
#> # A tibble: 1 × 7
#>   cohort_definition_id number_records number_subjects reason_id reason          
#>                  <int>          <int>           <int>     <int> <chr>           
#> 1                    1             48              48         1 Initial qualify…
#> # ℹ 2 more variables: excluded_records <int>, excluded_subjects <int>

generateDrugUtilisationCohortSet()

Now let’s try function DrugUtilisation::generateDrugUtilisationCohortSet() to get the drug cohort for ingredient simvastatin. This function has a lot more options you can set. We first use default settings:

# Simvastatin drug cohort generated with the default settings.
cdm <- generateDrugUtilisationCohortSet(
  cdm = cdm,
  conceptSet = conceptSet_ingredient,
  name = "dus_alleras"
)
cdm$dus_alleras
#> # Source:   table<dus_alleras> [?? x 4]
#> # Database: DuckDB v0.9.2 [unknown@Linux 6.2.0-1019-azure:R 4.3.2/:memory:]
#>    cohort_definition_id subject_id cohort_start_date cohort_end_date
#>                   <int>      <int> <date>            <date>         
#>  1                    1         89 2016-12-24        2017-02-04     
#>  2                    1         94 2002-12-09        2003-01-24     
#>  3                    1         98 2020-08-16        2020-08-19     
#>  4                    1         17 1997-01-12        1998-11-02     
#>  5                    1         79 2017-03-02        2019-06-02     
#>  6                    1         93 2013-11-22        2015-04-05     
#>  7                    1         50 2017-05-11        2017-05-20     
#>  8                    1         19 2022-05-26        2022-05-26     
#>  9                    1         23 1998-10-24        2002-12-12     
#> 10                    1         72 2010-01-16        2010-02-13     
#> # ℹ more rows

# Record and subject counts of the dus_alleras cohort.
cdm$dus_alleras %>%
  cohortCount()
#> # A tibble: 1 × 3
#>   cohort_definition_id number_records number_subjects
#>                  <int>          <int>           <int>
#> 1                    1             53              48

# Attrition of the dus_alleras cohort, restricted to the columns of interest.
cdm$dus_alleras %>%
  cohortAttrition() %>%
  select(number_records, reason, excluded_records, excluded_subjects)
#> # A tibble: 1 × 4
#>   number_records reason                    excluded_records excluded_subjects
#>            <int> <chr>                                <int>             <int>
#> 1             53 Initial qualifying events                0                 0

imputeDuration and durationRange

The parameter durationRange specifies the range within which the duration must fall, where duration = end date - start date + 1. The default is c(1, Inf). It should be a numeric vector of length two, with no NAs, and the first value should be less than or equal to the second one. Duration values outside of durationRange will be imputed using imputeDuration, which can be set to “none”, “median”, “mean”, “mode” or an integer (count).

# Accept durations from 0 days upwards and do not impute any duration.
cdm <- generateDrugUtilisationCohortSet(
  cdm = cdm,
  name = "dus_step2_0_inf",
  conceptSet = conceptSet_ingredient,
  durationRange = c(0, Inf), # default as c(1, Inf)
  imputeDuration = "none"
)

# Attrition of the resulting cohort.
cdm$dus_step2_0_inf %>%
  cohortAttrition() %>%
  select(number_records, reason, excluded_records, excluded_subjects)
#> # A tibble: 1 × 4
#>   number_records reason                    excluded_records excluded_subjects
#>            <int> <chr>                                <int>             <int>
#> 1             53 Initial qualifying events                0                 0

gapEra

The gapEra parameter defines the number of days allowed between two consecutive drug exposures for them to be considered the same era. Now let’s change it from 0 to a larger number. From the dus_step3_alleras cohort attrition, we can see that joining eras at STEP 3 results in fewer records than in the dus_step2_0_inf cohort, as exposures separated by gaps of less than 30 days are joined.

# Same settings as the previous step, now with a gapEra of 30 days.
cdm <- generateDrugUtilisationCohortSet(
  cdm = cdm,
  name = "dus_step3_alleras",
  conceptSet = conceptSet_ingredient,
  durationRange = c(0, Inf),
  imputeDuration = "none",
  gapEra = 30 # default as 0
)

# Attrition of the resulting cohort.
cdm$dus_step3_alleras %>%
  cohortAttrition() %>%
  select(number_records, reason, excluded_records, excluded_subjects)
#> # A tibble: 2 × 4
#>   number_records reason                       excluded_records excluded_subjects
#>            <int> <chr>                                   <int>             <int>
#> 1             53 Initial qualifying events                   0                 0
#> 2             51 join exposures separated by…                2                 0

priorUseWashout

The priorUseWashout parameter specifies the number of prior days without exposure (often termed a ‘washout’) that are required. By default, it is set to NULL, meaning no washout period is necessary. In the example provided, we observe a reduction in the number of records in STEP 4 for cohort dus_alleras_step4 due to the washout period required, compared to the dus_step3_alleras cohort.

# Add a prior-use washout of 30 days on top of the previous settings.
cdm <- generateDrugUtilisationCohortSet(
  cdm = cdm,
  name = "dus_alleras_step4",
  conceptSet = conceptSet_ingredient,
  durationRange = c(0, Inf),
  imputeDuration = "none",
  gapEra = 30,
  priorUseWashout = 30
)

# Attrition of the resulting cohort.
cdm$dus_alleras_step4 %>%
  cohortAttrition() %>%
  select(number_records, reason, excluded_records, excluded_subjects)
#> # A tibble: 3 × 4
#>   number_records reason                       excluded_records excluded_subjects
#>            <int> <chr>                                   <int>             <int>
#> 1             53 Initial qualifying events                   0                 0
#> 2             51 join exposures separated by…                2                 0
#> 3             51 require prior use washout o…                0                 0

priorObservation

The parameter priorObservation defines the minimum number of days of prior observation necessary for drug eras to be taken into account. If set to NULL, the drug eras are not required to fall within the observation_period. In this example, there is a noticeable decrease in the number of records for dus_alleras_step5 cohort in STEP 5 when compared to the dus_alleras_step4 cohort.

# Additionally require 30 days of prior observation.
cdm <- generateDrugUtilisationCohortSet(
  cdm = cdm,
  name = "dus_alleras_step5",
  conceptSet = conceptSet_ingredient,
  durationRange = c(0, Inf),
  imputeDuration = "none",
  gapEra = 30,
  priorUseWashout = 30,
  priorObservation = 30
)

# Attrition of the resulting cohort.
cdm$dus_alleras_step5 %>%
  cohortAttrition() %>%
  select(number_records, reason, excluded_records, excluded_subjects)
#> # A tibble: 4 × 4
#>   number_records reason                       excluded_records excluded_subjects
#>            <int> <chr>                                   <int>             <int>
#> 1             53 Initial qualifying events                   0                 0
#> 2             51 join exposures separated by…                2                 0
#> 3             51 require prior use washout o…                0                 0
#> 4             48 require at least 30 prior o…                3                 2

cohortDateRange

The cohortDateRange parameter defines the range for the cohort_start_date and cohort_end_date. In the following example, one can observe a reduction in STEP 6 and STEP 7 due to the constraints imposed on the cohort start and end dates.

# Restrict cohort start and end dates to the range 2010-01-01 to 2011-01-01,
# keeping all eras that fulfil the criteria. The argument is spelled out in
# full as `limit` rather than relying on partial argument matching.
cdm <- generateDrugUtilisationCohortSet(cdm,
  name = "dus_alleras_step67",
  conceptSet = conceptSet_ingredient,
  imputeDuration = "none",
  durationRange = c(0, Inf),
  gapEra = 30,
  priorUseWashout = 30,
  priorObservation = 30,
  cohortDateRange = as.Date(c("2010-01-01", "2011-01-01")),
  limit = "All"
)

# Attrition of the resulting cohort.
cohortAttrition(cdm$dus_alleras_step67) %>%
  select(number_records, reason, excluded_records, excluded_subjects)
#> # A tibble: 6 × 4
#>   number_records reason                       excluded_records excluded_subjects
#>            <int> <chr>                                   <int>             <int>
#> 1             53 Initial qualifying events                   0                 0
#> 2             51 join exposures separated by…                2                 0
#> 3             51 require prior use washout o…                0                 0
#> 4             48 require at least 30 prior o…                3                 2
#> 5             34 restrict cohort_start_date …               14                14
#> 6              4 restrict cohort_end_date on…               30                28

limit: First era that fulfills the criteria

Change the limit parameter from All to First and observe how it impacts the attrition of the dus_step8_firstera cohort in comparison to the dus_alleras_step67 cohort. The number of records decreased at STEP 8 because of the First limit. It gets the first record that fulfills all criteria.

# Same restrictions as before, keeping only the first era that fulfils them.
cdm <- generateDrugUtilisationCohortSet(
  cdm = cdm,
  name = "dus_step8_firstera",
  conceptSet = conceptSet_ingredient,
  durationRange = c(0, Inf),
  imputeDuration = "none",
  gapEra = 30,
  priorUseWashout = 30,
  priorObservation = 30,
  cohortDateRange = as.Date(c("2010-01-01", "2011-01-01")),
  limit = "First"
)

# Attrition of the resulting cohort.
cdm$dus_step8_firstera %>%
  cohortAttrition() %>%
  select(number_records, reason, excluded_records, excluded_subjects)
#> # A tibble: 7 × 4
#>   number_records reason                       excluded_records excluded_subjects
#>            <int> <chr>                                   <int>             <int>
#> 1             53 Initial qualifying events                   0                 0
#> 2             51 join exposures separated by…                2                 0
#> 3             51 require prior use washout o…                0                 0
#> 4             48 require at least 30 prior o…                3                 2
#> 5             34 restrict cohort_start_date …               14                14
#> 6              4 restrict cohort_end_date on…               30                28
#> 7              4 restric to first record                     0                 0

limit: First ever era

The parameter limit only allows All and First. The First value represents the first era that meets the criteria set by the parameters prior to limit. However, if the goal is to get the first-ever era, this can be achieved using this function too. Setting the following parameter will result in the first ever drug era:

# Settings used to obtain the first-ever drug era: no era gap, an infinite
# prior-use washout, no prior observation requirement, no date restriction,
# and the limit set to "First".
cdm <- generateDrugUtilisationCohortSet(
  cdm = cdm,
  name = "dus_step8_firstever",
  conceptSet = conceptSet_ingredient,
  durationRange = c(0, Inf),
  imputeDuration = "none",
  gapEra = 0,
  priorUseWashout = Inf,
  priorObservation = 0,
  cohortDateRange = as.Date(c(NA, NA)),
  limit = "First"
)

# Close the DuckDB connection opened at the start, shutting the database down.
DBI::dbDisconnect(conn = con, shutdown = TRUE)

Constructing concept sets and generating various cohorts are the initial steps in conducting a drug utilisation study. For further guidance on obtaining more information from these cohorts, such as their characteristics, please refer to the other vignettes.

Marti Catala, Mike Du, Yuchen Guo, Kim Lopez-Guell, Edward Burn, Xintong Li