Age Stratification

statistical inference

Post modeling analysis for naive age stratification

Published

June 13, 2023

Overview

This analysis utilizes FluSurv data to first estimate weekly and age class specific influenza rates for the period 2010-2023, then second, to apply estimated rates to flusion data to approximate date, location, and age group specific influenza hospitalizations. Because FluSurv only provides data for 18 states, rate modeling includes covariates to account for time and location sampling biases within a time series model.

Libraries

Hide code

#wrangling
library(tidyverse)
library(lubridate)
library(data.table, include.only = "fread")
library(cdcfluview)

#spatial
library(CovidCAR)

#inference
library(INLA)

options(dplyr.summarise.inform = FALSE, show_col_types = FALSE)

Age Tables

Creating a crosswalk to match FluSurv age groups to those in US Census data. US Census data is used as a random slopes type predictor in the time series model to provide age-specfic offsets.

Note: Census codes were queried using the censusapi package. This work is shown in the non-rendered version of this document in the GitHub repository.

Hide code

demog_table  <- as.data.frame(
  cbind(
    census_classes = c(1, 6:18, 20, 21, 23),
    flsv_classes = c("0-4 yr", "18-29 yr", "30-39 yr", "30-39 yr", "40-49 yr", "40-49 yr", "50-64 yr",
                     "50-64 yr", "50-64 yr", "65-74 yr", "65-74 yr", "75-84 yr", "75-84 yr", "85+",
                     "5-17 yr", "5-17 yr", "18-29 yr")
  )
)

demog_table

census_classes	flsv_classes
1	0-4 yr
6	18-29 yr
7	30-39 yr
8	30-39 yr
9	40-49 yr
10	40-49 yr
11	50-64 yr
12	50-64 yr
13	50-64 yr
14	65-74 yr
15	65-74 yr
16	75-84 yr
17	75-84 yr
18	85+
20	5-17 yr
21	5-17 yr
23	18-29 yr

FluSurv Data

The hospitalizations() function from the cdcfluview package does most of the work by querying FluView, but seems only to be able to take small bites at a time.

Hide code

myRegions <- surveillance_areas() 

flusurv <- do.call(rbind, lapply(seq_len(dim(myRegions)[1]), function(i) {
  hospitalizations(surveillance_area = myRegions$surveillance_area[i], region = myRegions$region[i])
}))

#save copy
#write.csv(flusurv, "./2033-06-01-data/flusurv_query.csv", row.names = FALSE)
#write.csv(flusurv, "./2033-06-01-data/flusurv_all.csv", row.names = FALSE) #Entire Network

Hide code

flusurv <- fread("D:/Github/flusion/data/flusurv_all.csv") %>%
  filter(age_label %in% unique(demog_table$flsv_classes), #age classes
         region != "Entire Network", #states only
         year >= 2010 & year <= 2019) %>% #the pkg fails on dates after 2020,ugh
  mutate(age_class = age_label,
         location_name = region,
         network = surveillance_area,
         weeklyrate = as.numeric(weeklyrate),
         epiweek = year_wk_num) %>%
    select(location_name, year, epiweek, network, weeklyrate, age_class)

#manual download from site 2023-06-01
flusurv_2020 <- fread("D:/Github/flusion/data/FluSurveillance_2020.csv") %>%
  rename_all(~gsub(" |-", "", .)) %>%
  filter(AGECATEGORY %in% unique(demog_table$flsv_classes), #ages to keep
         SEXCATEGORY == "Overall", #all sexes
         RACECATEGORY == "Overall",
         WEEKLYRATE != "null",
         CATCHMENT != "Entire Network", #states only
         MMWRYEAR >= 2020) %>% #the pkg fails on dates after 2020,ugh
  mutate(age_class = AGECATEGORY,
         location_name = CATCHMENT,
         network = NETWORK,
         year = MMWRYEAR,
         epiweek = MMWRWEEK,
         rate = CUMULATIVERATE,
         weeklyrate = as.numeric(WEEKLYRATE)) %>%
    select(location_name, year, epiweek, network, weeklyrate, age_class)

#Join date ranges
flusurv = rbind(flusurv, flusurv_2020)
flusurv$weeklyrate.s <- log(flusurv$weeklyrate + 0.0001)

flusurv$location_name[flusurv$location_name == "New York - Albany"] = "New York"
flusurv$location_name[flusurv$location_name == "New York - Rochester"] ="New York"

flusurv <- as.data.frame(
  flusurv %>%
  group_by(location_name, year, epiweek, age_class) %>%
  summarise(weeklyrate = mean(weeklyrate, na.rm=T),
            weeklyrate.s = mean(weeklyrate.s, na.rm=T))
)


unique(duplicated(flusurv))

[1] FALSE

Hide code

head(flusurv)

location_name	year	epiweek	age_class	weeklyrate	weeklyrate.s
California	2010	1	0-4 yr	0.5	-0.6929472
California	2010	1	18-29 yr	0.3	-1.4972664
California	2010	1	30-39 yr	0.4	-0.9160408
California	2010	1	40-49 yr	0.0	-9.2103404
California	2010	1	5-17 yr	0.2	-1.6089380
California	2010	1	50-64 yr	0.5	-0.6929472

Demographic Data

US population estimates by state and age group from the U.S. Census Bureau

Hide code

all_pop <- censusapi::getCensus(key = get_api("censusapi"),
                                name = "pep/charagegroups",
                                vars = c('AGEGROUP','POP'),
                                vintage = "2018",
                                region = "state:*") %>%
  filter(AGEGROUP %in% unique(demog_table$census_classes)) %>%
  mutate(location = state,
         pop_age = POP,
         census_classes = AGEGROUP) %>%
  select(-c(state, AGEGROUP, POP))

all_pop$age_class <- with(demog_table,
                       flsv_classes[match(
                           all_pop$census_classes,
                                   census_classes)])


all_pop <- as.data.frame(
  all_pop %>%
  group_by(location, age_class) %>%
  summarise(pop_age = sum(pop_age))
)

Read FIPS codes

State FIPS numbers to match Census and FluSurv data. This file was created during Preprocessing.

Hide code

locations <- fread("D:/Github/flusion/data/locations.csv") %>%
  select(-c(count_rate1per100k, count_rate2per100k)) %>%
  filter(location_name != "US") #

Match FIPS to data.

Hide code

all_pop <- left_join(all_pop, locations, by = "location") %>%
  filter(location_name %in% unique(flusurv$location_name))

unique(duplicated(flusurv[,c("location_name", "age_class", "epiweek","year")]))

[1] FALSE

Hide code

comb_data <- left_join(flusurv, all_pop, by = c("location_name", "age_class"))

#date
comb_data$date <- MMWRweek::MMWRweek2Date(comb_data$year, comb_data$epiweek, 7)

Expand Dates

FluSurv reports are not available for many dates within the analysis period, partculary outside of the primary flu season. Because the goal is to estimate continuous rate variation, a data frame is created listing all dates is created.

Get Fusion Data

Need current date range in flusion data so that results here can be later matched. Flusion_v1 does not include age structure.

Hide code

#function to downlaod file
get_data <- function(url) {
  df <- read_csv(url)
  return(df)
}

flusion_url <- "https://github.com/JMHumphreys/flusion/raw/main/flusion/flusion_v1.csv"
flusion <- get_data(flusion_url)

head(flusion)

date	year	epiweek	abbreviation	location	location_name	q_0.025	q_0.25	q_0.50	q_0.75	q_0.975
2010-10-09	2010	40	AL	01	Alabama	0.13	0.43	0.80	1.48	4.83
2010-10-09	2010	40	AK	02	Alaska	0.04	0.14	0.26	0.48	1.58
2010-10-09	2010	40	AZ	04	Arizona	2.68	8.71	16.12	29.72	93.15
2010-10-09	2010	40	AR	05	Arkansas	0.08	0.25	0.47	0.87	2.82
2010-10-09	2010	40	CA	06	California	1.81	5.89	10.93	20.24	64.55
2010-10-09	2010	40	CO	08	Colorado	1.12	3.65	6.77	12.55	40.19

Create a date vector to capture the entire date range for each age class.

Hide code

min_date <- min(flusion$date) #flusion date
max_date <- max(comb_data$date) #FluSurv date
date_range <- seq(min_date, max_date, by = "1 week")

comb_data <- comb_data %>%
  filter(date >= min_date & date <= max_date)

flusion_cut <- flusion %>%
  filter(date >= min_date & date <= max_date)

date_age_tab = as.data.frame(rep(unique(demog_table$flsv_classes), length(date_range)))
names(date_age_tab) = "age_class"
date_age_tab$date <- rep(unique(date_range), length(unique(demog_table$flsv_classes)))


grid_expand <- left_join(date_age_tab, comb_data, by = c("date", "age_class"))
grid_expand$location_name = ifelse(is.na(grid_expand$location_name) == TRUE, "unknown", grid_expand$location_name)

#FluSurv values were transformed to log scale  
grid_expand$weeklyrate.s <- ifelse(is.na(grid_expand$weeklyrate.s), log(0.0001), grid_expand$weeklyrate.s)

Organize Data

Verifying that all variables needed for modeling are present.

Hide code

grid_expand <- grid_expand %>%
  mutate(intercept = 1, 
         ts_weeks.1 = as.integer(as.factor(date)),
         ts_weeks.2 = ts_weeks.1,
         region.1 = as.integer(as.factor(location_name)),
         class.1 = as.integer(as.factor(age_class)),
         class.2 = class.1,
         class.3 = class.1,
         year = year(date),
         year.int = as.integer(as.factor(year)),
         l_age_pop = pop_age/10^5)

# find average population across FluSurv's 18 states
pop_state_age <- comb_data %>%
  group_by(age_class) %>%
  summarise(mean_age_pop = mean(pop_age)/10^5) #rate/100k

grid_expand <- left_join(grid_expand, pop_state_age, by = c("age_class"))
grid_expand$l_age_pop <- ifelse(grid_expand$location_name == "unknown", grid_expand$mean_age_pop, grid_expand$l_age_pop)

Model

Hide code

#prior
pc.prior = list(prec = list(prior="pc.prec", #prior
                            param = c(1, 0.5)))

#formula
form_age <- weeklyrate.s ~ -1 + #log-rate per 100k
  f(ts_weeks.1,  #time series model           
    constr=TRUE, #rw models perform well also 
    model="iid", #iid more conservative
    group = class.1, #age group specific
    control.group=list(model="iid"),
    hyper= pc.prior) +
  f(class.2,   #random intercepts for age-specific means
    constr=TRUE,
    model="iid",
    hyper=pc.prior) +
  f(class.3, l_age_pop, #~random slopes for pop      
   constr=TRUE,
    model="iid",
    hyper=pc.prior) +
  f(region.1,   #region level variation     
    constr=TRUE,
    model="iid",
    group = year.int, #year level variation
    control.group=list(model="iid"),
    hyper=pc.prior) 

#run model
age.mod = inla(form_age, #formula
               data = grid_expand, #data 
               family = c("gaussian"), #negative binomial
               verbose = FALSE, 
               quantiles = c(0.025, 0.25, 0.5, 0.75, 0.975),
               control.fixed = list(prec = 1, 
                                    prec.intercept = 1), 
               control.predictor = list(
                                  compute = TRUE, 
                                     link = 1), 
               control.inla = list(strategy = "gaussian", 
                              int.strategy = "eb"), 
               control.compute=list(dic = F, cpo = F, waic = F, config=F))


#extract and scale results
fitted_rates <- age.mod$summary.fitted.values[,3:7]
names(fitted_rates) <- c("q0.025", "q0.25","q0.5","q0.75","q0.975")
fitted_rates <- apply(fitted_rates, 2, exp)

2010-2023 Weekly Rates

View period of record rates.

Hide code

esti_rates <- cbind(grid_expand, fitted_rates)

esti_rates <- esti_rates %>%
  group_by(date, age_class) %>%
  summarise(q0.025 = median(q0.025),
            q0.25 = median(q0.25),
            q0.5 = median(q0.5),
            q0.75 = median(q0.75),
            q0.975 = median(q0.975))

ggplot(esti_rates, aes(date, q0.5,  group=age_class, col=age_class), fill=age_class) +
   geom_line(linewidth = 1) +
   viridis::scale_color_viridis("Age Group",
                             discrete=T,
                             option = "turbo",
                             direction = -1,
                             na.value = "white") +
    scale_x_date(date_breaks = "6 month", date_labels = "%b-%d-%Y") +
    scale_y_continuous(breaks = scales::pretty_breaks(8), limits = c(0, 40)) +
    xlab(" ") +
    ylab("Rate per 100,000") + 
    ggtitle("Influenza Hospitalization") +
    theme_classic() +
    theme(plot.margin = unit(c(1,0.5,1,0.5), "cm"),
      panel.grid.minor = element_blank(),
      panel.grid.major = element_blank(),
      panel.background = element_blank(),
      plot.background = element_blank(),
      panel.border = element_blank(),
      legend.title = element_text(size = 16, face = "bold", hjust=0.5),
      legend.text = element_text(size=10, face="bold"),
      strip.text = element_text(size=16, face="bold"),
      strip.background = element_blank(),
      legend.position = "bottom",
      legend.direction = "horizontal",
      legend.key.width = unit(2,"line"),
      axis.text.y = element_text(face="bold", size=12),
      axis.text.x = element_text(face="bold", size=10, angle = 60, hjust=1),
      axis.title.x = element_text(size=18, face="bold"),
      axis.title.y = element_text(size=18, face="bold"),
      plot.title = element_text(size=25, face="bold", hjust=0.5))

2022-2023 Weekly Rates

Zoom in to view most recent season. Snapshot for same period provided by FluSurv is below for comparison.

Hide code

xmin = as_date("2022-10-08")
xmax = as_date("2023-04-29")

ggplot(esti_rates, aes(date, q0.5,  group=age_class, col=age_class), fill=age_class) +
   geom_line(linewidth = 1) +
   geom_point(size=1.5, pch=19) +
   geom_hline(yintercept = 0, 
                linetype = "solid",
                   colour = "gray60",
                   linewidth = 0.75) +
  viridis::scale_color_viridis("Age Group",
                             discrete=T,
                             option = "turbo",
                             direction = -1,
                             na.value = "white") +
    scale_x_date(date_breaks = "2 week", date_labels = "%b-%d-%Y", limits = c(xmin,xmax)) +
    scale_y_continuous(breaks = scales::pretty_breaks(8), limits = c(0, 40)) +
    xlab(" ") +
    ylab("Rate per 100,000") + 
    ggtitle("Influenza Hospitalization") +
    theme_classic() +
    theme(plot.margin = unit(c(2,0.5,2,0.5), "cm"),
      panel.grid.minor = element_blank(),
      panel.grid.major = element_blank(),
      panel.background = element_blank(),
      plot.background = element_blank(),
      panel.border = element_blank(),
      legend.title = element_text(size = 16, face = "bold", hjust=0.5),
      legend.text = element_text(size=10, face="bold"),
      strip.text = element_text(size=16, face="bold"),
      strip.background = element_blank(),
      legend.position = c(0.7, 0.5),
      legend.direction = "vertical",
      legend.key.width = unit(2,"line"),
      axis.text.y = element_text(face="bold", size=14),
      axis.text.x = element_text(face="bold", size=14, angle = 60, hjust=1),
      axis.title.x = element_text(size=22, face="bold"),
      axis.title.y = element_text(size=22, face="bold"),
      plot.title = element_text(size=25, face="bold", hjust=0.5))

Warning: Removed 5634 rows containing missing values (`geom_line()`).

Warning: Removed 5634 rows containing missing values (`geom_point()`).

Compare the estimated values to those on FluSurv Image from FluSurv