#wrangling
library(tidyverse)
library(lubridate)
library(data.table, include.only = "fread")
library(cdcfluview)
#spatial
library(CovidCAR)
#inference
library(INLA)
options(dplyr.summarise.inform = FALSE, readr.show_col_types = FALSE)
This analysis uses FluSurv data first to estimate weekly, age-class-specific influenza hospitalization rates for 2010-2023, and then applies those estimated rates to the flusion data to approximate influenza hospitalizations by date, location, and age group. Because FluSurv only provides data for 18 states, the rate model includes covariates within a time series framework to account for sampling biases across time and location.
Create a crosswalk to match FluSurv age groups to those in the US Census data. The Census population counts are used as a random-slopes-type predictor in the time series model to provide age-specific offsets.
Note: Census codes were queried using the censusapi package. This work is shown in the non-rendered version of this document in the GitHub repository.
demog_table <- as.data.frame(
  cbind(
    census_classes = c(1, 6:18, 20, 21, 23),
    flsv_classes = c("0-4 yr", "18-29 yr", "30-39 yr", "30-39 yr", "40-49 yr", "40-49 yr", "50-64 yr",
                     "50-64 yr", "50-64 yr", "65-74 yr", "65-74 yr", "75-84 yr", "75-84 yr", "85+",
                     "5-17 yr", "5-17 yr", "18-29 yr")
  )
)
demog_table
census_classes | flsv_classes |
---|---|
1 | 0-4 yr |
6 | 18-29 yr |
7 | 30-39 yr |
8 | 30-39 yr |
9 | 40-49 yr |
10 | 40-49 yr |
11 | 50-64 yr |
12 | 50-64 yr |
13 | 50-64 yr |
14 | 65-74 yr |
15 | 65-74 yr |
16 | 75-84 yr |
17 | 75-84 yr |
18 | 85+ |
20 | 5-17 yr |
21 | 5-17 yr |
23 | 18-29 yr |
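As an optional, illustrative check of the crosswalk: census codes should be unique, and tabulating flsv_classes shows how many census codes feed each FluSurv age group.
#illustrative crosswalk checks
stopifnot(!any(duplicated(demog_table$census_classes))) #census codes are unique
table(demog_table$flsv_classes) #census codes per FluSurv age group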
The hospitalizations() function from the cdcfluview package does most of the work by querying FluView, but it seems able to take only small bites at a time, so the query below loops over surveillance areas.
myRegions <- surveillance_areas()

flusurv <- do.call(rbind, lapply(seq_len(dim(myRegions)[1]), function(i) {
  hospitalizations(surveillance_area = myRegions$surveillance_area[i], region = myRegions$region[i])
}))
#save copy
#write.csv(flusurv, "./2033-06-01-data/flusurv_query.csv", row.names = FALSE)
#write.csv(flusurv, "./2033-06-01-data/flusurv_all.csv", row.names = FALSE) #Entire Network
<- fread("D:/Github/flusion/data/flusurv_all.csv") %>%
flusurv filter(age_label %in% unique(demog_table$flsv_classes), #age classes
!= "Entire Network", #states only
region >= 2010 & year <= 2019) %>% #the pkg fails on dates after 2020,ugh
year mutate(age_class = age_label,
location_name = region,
network = surveillance_area,
weeklyrate = as.numeric(weeklyrate),
epiweek = year_wk_num) %>%
select(location_name, year, epiweek, network, weeklyrate, age_class)
#manual download from site 2023-06-01
flusurv_2020 <- fread("D:/Github/flusion/data/FluSurveillance_2020.csv") %>%
  rename_all(~gsub(" |-", "", .)) %>%
  filter(AGECATEGORY %in% unique(demog_table$flsv_classes), #ages to keep
         SEXCATEGORY == "Overall", #all sexes
         RACECATEGORY == "Overall",
         WEEKLYRATE != "null",
         CATCHMENT != "Entire Network", #states only
         MMWRYEAR >= 2020) %>% #the pkg fails on dates after 2020, ugh
  mutate(age_class = AGECATEGORY,
         location_name = CATCHMENT,
         network = NETWORK,
         year = MMWRYEAR,
         epiweek = MMWRWEEK,
         rate = CUMULATIVERATE,
         weeklyrate = as.numeric(WEEKLYRATE)) %>%
  select(location_name, year, epiweek, network, weeklyrate, age_class)
#Join date ranges
flusurv = rbind(flusurv, flusurv_2020)
flusurv$weeklyrate.s <- log(flusurv$weeklyrate + 0.0001)

flusurv$location_name[flusurv$location_name == "New York - Albany"] = "New York"
flusurv$location_name[flusurv$location_name == "New York - Rochester"] = "New York"
flusurv <- as.data.frame(
  flusurv %>%
    group_by(location_name, year, epiweek, age_class) %>%
    summarise(weeklyrate = mean(weeklyrate, na.rm = T),
              weeklyrate.s = mean(weeklyrate.s, na.rm = T))
)
unique(duplicated(flusurv))
[1] FALSE
head(flusurv)
location_name | year | epiweek | age_class | weeklyrate | weeklyrate.s |
---|---|---|---|---|---|
California | 2010 | 1 | 0-4 yr | 0.5 | -0.6929472 |
California | 2010 | 1 | 18-29 yr | 0.3 | -1.4972664 |
California | 2010 | 1 | 30-39 yr | 0.4 | -0.9160408 |
California | 2010 | 1 | 40-49 yr | 0.0 | -9.2103404 |
California | 2010 | 1 | 5-17 yr | 0.2 | -1.6089380 |
California | 2010 | 1 | 50-64 yr | 0.5 | -0.6929472 |
US population estimates by state and age group come from the U.S. Census Bureau.
all_pop <- censusapi::getCensus(key = get_api("censusapi"),
                                name = "pep/charagegroups",
                                vars = c('AGEGROUP','POP'),
                                vintage = "2018",
                                region = "state:*") %>%
  filter(AGEGROUP %in% unique(demog_table$census_classes)) %>%
  mutate(location = state,
         pop_age = POP,
         census_classes = AGEGROUP) %>%
  select(-c(state, AGEGROUP, POP))
all_pop$age_class <- with(demog_table,
                          flsv_classes[match(all_pop$census_classes,
                                             census_classes)])

all_pop <- as.data.frame(
  all_pop %>%
    group_by(location, age_class) %>%
    summarise(pop_age = sum(pop_age))
)
State FIPS numbers to match Census and FluSurv data. This file was created during Preprocessing.
<- fread("D:/Github/flusion/data/locations.csv") %>%
locations select(-c(count_rate1per100k, count_rate2per100k)) %>%
filter(location_name != "US") #
Match FIPS to data.
all_pop <- left_join(all_pop, locations, by = "location") %>%
  filter(location_name %in% unique(flusurv$location_name))
unique(duplicated(flusurv[,c("location_name", "age_class", "epiweek","year")]))
[1] FALSE
comb_data <- left_join(flusurv, all_pop, by = c("location_name", "age_class"))

#date
comb_data$date <- MMWRweek::MMWRweek2Date(comb_data$year, comb_data$epiweek, 7)
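MMWRweek2Date() converts an MMWR year and epidemiological week into a calendar date; the third argument selects the day within the MMWR week (1 = Sunday, 7 = Saturday), so each record is assigned its week-ending Saturday. A minimal illustration:
#week-ending (Saturday) date for MMWR week 1 of 2010
MMWRweek::MMWRweek2Date(2010, 1, 7)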
FluSurv reports are not available for many dates within the analysis period, particularly outside of the primary flu season. Because the goal is to estimate continuous rate variation, a data frame listing all dates is created.
The current date range in the flusion data is needed so that the results here can be matched to it later. Note that flusion_v1 does not include age structure.
#function to download file
get_data <- function(url) {
  df <- read_csv(url)
  return(df)
}

flusion_url <- "https://github.com/JMHumphreys/flusion/raw/main/flusion/flusion_v1.csv"
flusion <- get_data(flusion_url)
head(flusion)
date | year | epiweek | abbreviation | location | location_name | q_0.025 | q_0.25 | q_0.50 | q_0.75 | q_0.975 |
---|---|---|---|---|---|---|---|---|---|---|
2010-10-09 | 2010 | 40 | AL | 01 | Alabama | 0.13 | 0.43 | 0.80 | 1.48 | 4.83 |
2010-10-09 | 2010 | 40 | AK | 02 | Alaska | 0.04 | 0.14 | 0.26 | 0.48 | 1.58 |
2010-10-09 | 2010 | 40 | AZ | 04 | Arizona | 2.68 | 8.71 | 16.12 | 29.72 | 93.15 |
2010-10-09 | 2010 | 40 | AR | 05 | Arkansas | 0.08 | 0.25 | 0.47 | 0.87 | 2.82 |
2010-10-09 | 2010 | 40 | CA | 06 | California | 1.81 | 5.89 | 10.93 | 20.24 | 64.55 |
2010-10-09 | 2010 | 40 | CO | 08 | Colorado | 1.12 | 3.65 | 6.77 | 12.55 | 40.19 |
Create a date vector to capture the entire date range for each age class.
min_date <- min(flusion$date) #flusion date
max_date <- max(comb_data$date) #FluSurv date
date_range <- seq(min_date, max_date, by = "1 week")

comb_data <- comb_data %>%
  filter(date >= min_date & date <= max_date)

flusion_cut <- flusion %>%
  filter(date >= min_date & date <= max_date)

date_age_tab = as.data.frame(rep(unique(demog_table$flsv_classes), length(date_range)))
names(date_age_tab) = "age_class"
date_age_tab$date <- rep(unique(date_range), length(unique(demog_table$flsv_classes)))

grid_expand <- left_join(date_age_tab, comb_data, by = c("date", "age_class"))
grid_expand$location_name = ifelse(is.na(grid_expand$location_name), "unknown", grid_expand$location_name)

#FluSurv values were transformed to log scale
grid_expand$weeklyrate.s <- ifelse(is.na(grid_expand$weeklyrate.s), log(0.0001), grid_expand$weeklyrate.s)
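Two illustrative checks on the expanded grid (assuming the objects built above): every week in date_range should be present, and rows flagged "unknown" mark date and age combinations with no matching FluSurv report.
#illustrative checks on the expanded grid
length(unique(grid_expand$date)) == length(date_range) #all weeks present
table(grid_expand$location_name == "unknown") #reported vs filled-in rows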
Verifying that all variables needed for modeling are present.
grid_expand <- grid_expand %>%
  mutate(intercept = 1,
         ts_weeks.1 = as.integer(as.factor(date)),
         ts_weeks.2 = ts_weeks.1,
         region.1 = as.integer(as.factor(location_name)),
         class.1 = as.integer(as.factor(age_class)),
         class.2 = class.1,
         class.3 = class.1,
         year = year(date),
         year.int = as.integer(as.factor(year)),
         l_age_pop = pop_age/10^5)

# find average population across FluSurv's 18 states
pop_state_age <- comb_data %>%
  group_by(age_class) %>%
  summarise(mean_age_pop = mean(pop_age)/10^5) #rate/100k

grid_expand <- left_join(grid_expand, pop_state_age, by = c("age_class"))
grid_expand$l_age_pop <- ifelse(grid_expand$location_name == "unknown", grid_expand$mean_age_pop, grid_expand$l_age_pop)
#prior
pc.prior = list(prec = list(prior = "pc.prec",
                            param = c(1, 0.5)))

#formula
form_age <- weeklyrate.s ~ -1 + #log-rate per 100k
  f(ts_weeks.1, #time series model
    constr = TRUE, #rw models perform well also
    model = "iid", #iid more conservative
    group = class.1, #age group specific
    control.group = list(model = "iid"),
    hyper = pc.prior) +
  f(class.2, #random intercepts for age-specific means
    constr = TRUE,
    model = "iid",
    hyper = pc.prior) +
  f(class.3, l_age_pop, #~random slopes for pop
    constr = TRUE,
    model = "iid",
    hyper = pc.prior) +
  f(region.1, #region level variation
    constr = TRUE,
    model = "iid",
    group = year.int, #year level variation
    control.group = list(model = "iid"),
    hyper = pc.prior)
#run model
age.mod = inla(form_age, #formula
               data = grid_expand, #data
               family = c("gaussian"), #gaussian likelihood on the log scale
               verbose = FALSE,
               quantiles = c(0.025, 0.25, 0.5, 0.75, 0.975),
               control.fixed = list(prec = 1,
                                    prec.intercept = 1),
               control.predictor = list(
                 compute = TRUE,
                 link = 1),
               control.inla = list(strategy = "gaussian",
                                   int.strategy = "eb"),
               control.compute = list(dic = F, cpo = F, waic = F, config = F))
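The estimated precision hyperparameters for each random effect, governed by the PC priors defined above, can be inspected from the fitted object; for example:
#inspect estimated hyperparameters (precisions for each f() term)
age.mod$summary.hyperpar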
#extract and scale results
fitted_rates <- age.mod$summary.fitted.values[,3:7]
names(fitted_rates) <- c("q0.025", "q0.25", "q0.5", "q0.75", "q0.975")
fitted_rates <- apply(fitted_rates, 2, exp)
View period of record rates.
esti_rates <- cbind(grid_expand, fitted_rates)

esti_rates <- esti_rates %>%
  group_by(date, age_class) %>%
  summarise(q0.025 = median(q0.025),
            q0.25 = median(q0.25),
            q0.5 = median(q0.5),
            q0.75 = median(q0.75),
            q0.975 = median(q0.975))
ggplot(esti_rates, aes(date, q0.5, group=age_class, col=age_class), fill=age_class) +
geom_line(linewidth = 1) +
::scale_color_viridis("Age Group",
viridisdiscrete=T,
option = "turbo",
direction = -1,
na.value = "white") +
scale_x_date(date_breaks = "6 month", date_labels = "%b-%d-%Y") +
scale_y_continuous(breaks = scales::pretty_breaks(8), limits = c(0, 40)) +
xlab(" ") +
ylab("Rate per 100,000") +
ggtitle("Influenza Hospitalization") +
theme_classic() +
theme(plot.margin = unit(c(1,0.5,1,0.5), "cm"),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
panel.background = element_blank(),
plot.background = element_blank(),
panel.border = element_blank(),
legend.title = element_text(size = 16, face = "bold", hjust=0.5),
legend.text = element_text(size=10, face="bold"),
strip.text = element_text(size=16, face="bold"),
strip.background = element_blank(),
legend.position = "bottom",
legend.direction = "horizontal",
legend.key.width = unit(2,"line"),
axis.text.y = element_text(face="bold", size=12),
axis.text.x = element_text(face="bold", size=10, angle = 60, hjust=1),
axis.title.x = element_text(size=18, face="bold"),
axis.title.y = element_text(size=18, face="bold"),
plot.title = element_text(size=25, face="bold", hjust=0.5))
Zoom in to view the most recent season. A snapshot of the same period provided by FluSurv is shown below for comparison.
= as_date("2022-10-08")
xmin = as_date("2023-04-29")
xmax
ggplot(esti_rates, aes(date, q0.5, group=age_class, col=age_class), fill=age_class) +
geom_line(linewidth = 1) +
geom_point(size=1.5, pch=19) +
geom_hline(yintercept = 0,
linetype = "solid",
colour = "gray60",
linewidth = 0.75) +
::scale_color_viridis("Age Group",
viridisdiscrete=T,
option = "turbo",
direction = -1,
na.value = "white") +
scale_x_date(date_breaks = "2 week", date_labels = "%b-%d-%Y", limits = c(xmin,xmax)) +
scale_y_continuous(breaks = scales::pretty_breaks(8), limits = c(0, 40)) +
xlab(" ") +
ylab("Rate per 100,000") +
ggtitle("Influenza Hospitalization") +
theme_classic() +
theme(plot.margin = unit(c(2,0.5,2,0.5), "cm"),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
panel.background = element_blank(),
plot.background = element_blank(),
panel.border = element_blank(),
legend.title = element_text(size = 16, face = "bold", hjust=0.5),
legend.text = element_text(size=10, face="bold"),
strip.text = element_text(size=16, face="bold"),
strip.background = element_blank(),
legend.position = c(0.7, 0.5),
legend.direction = "vertical",
legend.key.width = unit(2,"line"),
axis.text.y = element_text(face="bold", size=14),
axis.text.x = element_text(face="bold", size=14, angle = 60, hjust=1),
axis.title.x = element_text(size=22, face="bold"),
axis.title.y = element_text(size=22, face="bold"),
plot.title = element_text(size=25, face="bold", hjust=0.5))
Warning: Removed 5634 rows containing missing values (`geom_line()`).
Warning: Removed 5634 rows containing missing values (`geom_point()`).
Compare the estimated values to those reported on FluSurv.
Apply the rates estimated above to the total incidence estimated by the flusion data.
The function below applies the rates (proportionally) and then joins the data sets.
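Before the full function, a toy illustration of the proportional split it performs (hypothetical numbers, not taken from the data): each age class receives a share of the flusion total equal to its estimated rate divided by the sum of rates across all age classes for that date and location.
#toy example of the proportional allocation (hypothetical numbers)
rate <- c(`0-4 yr` = 2.0, `18-29 yr` = 1.0, `65-74 yr` = 7.0) #estimated rates for one date/location
total <- 120 #flusion median hospitalizations for that date/location
total * rate / sum(rate) #allocated hospitalizations: 24, 12, 84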
# function
# flusion data (flus_data) and estimated rates (rate_est) as inputs
naive_stratify <- function(flus_data, rate_est){

  #date range for analysis
  min_date <- min(flus_data$date) #flusion date
  max_date <- max(rate_est$date) #FluSurv date

  #age groups
  age_grps <- unique(rate_est$age_class)

  # join flusion and rate estimates
  for(i in 1:length(age_grps)){
    tmp.fuse = flus_data %>%
      filter(date >= min_date & date <= max_date) %>%
      mutate(age_class = age_grps[i])

    rate_est = rate_est %>%
      filter(date >= min_date & date <= max_date)

    tmp.fuse = left_join(tmp.fuse, rate_est, by = c("date", "age_class"))

    if(i == 1){
      work_flus <- tmp.fuse
    } else{work_flus <- rbind(work_flus, tmp.fuse)}
  }

  #summed rate by date and location (all age groups)
  fuse_weights <- work_flus %>% #national weekly sum
    group_by(date, location_name) %>%
    summarise(s0.025 = sum(q0.025),
              s0.25 = sum(q0.25),
              s0.5 = sum(q0.5),
              s0.75 = sum(q0.75),
              s0.975 = sum(q0.975))

  # join to data
  work_flus <- left_join(work_flus, fuse_weights, by = c("date", "location_name"))

  # calculate proportions and multiply by flusion estimates
  prop_set <- work_flus %>%
    group_by(date, location_name, age_class) %>%
    mutate(n0.025 = q0.025/s0.025,
           n0.25 = q0.25/s0.25,
           n0.50 = q0.5/s0.5,
           n0.75 = q0.75/s0.75,
           n0.975 = q0.975/s0.975,
           q_0.025 = q_0.025*n0.025,
           q_0.25 = q_0.25*n0.25,
           q_0.50 = q_0.50*n0.50,
           q_0.75 = q_0.75*n0.75,
           q_0.975 = q_0.975*n0.975) %>% #select rows to match with flusion
    select(date, year, epiweek, abbreviation, location, location_name, age_class,
           q_0.025, q_0.25, q_0.50, q_0.75, q_0.975)
}
Execute function
flused_ages <- naive_stratify(flusion, esti_rates)
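As an illustrative check (using the column names produced by naive_stratify() above), summing the allocated medians back across age classes should recover the flusion medians for each date and location:
#check: allocated medians sum back to the flusion totals
flused_ages %>%
  group_by(date, location_name) %>%
  summarise(total_q0.50 = sum(q_0.50)) %>%
  head()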
ggplot(flused_ages, aes(date, q_0.50, fill=age_class), col = "transparent") +
geom_bar(position="stack", stat="identity") +
::scale_fill_viridis("Age Group",
viridisdiscrete=T,
option = "turbo",
direction = -1,
na.value = "white") +
scale_x_date(date_breaks = "6 month", date_labels = "%d-%Y") +
xlab(" ") +
ylab("Hospitalizations") +
ggtitle(" ") +
theme_classic() +
theme(plot.margin = unit(c(2,0.5,2,0.5), "cm"),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
panel.background = element_blank(),
plot.background = element_blank(),
panel.border = element_blank(),
legend.title = element_text(size = 16, face = "bold", hjust=0.5),
legend.text = element_text(size=10, face="bold"),
strip.text = element_text(size=16, face="bold"),
strip.background = element_blank(),
legend.position = "bottom",
legend.direction = "horizontal",
legend.key.width = unit(2,"line"),
axis.text.y = element_text(face="bold", size=14),
axis.text.x = element_text(face="bold", size=14, angle = 60, hjust=1),
axis.title.x = element_text(size=22, face="bold"),
axis.title.y = element_text(size=22, face="bold"),
plot.title = element_text(size=25, face="bold", hjust=0.5))
ggplot(flused_ages, aes(date, q_0.50, fill=age_class), col = "transparent") +
geom_bar(position="stack", stat="identity") +
::scale_fill_viridis("Age Group",
viridisdiscrete=T,
option = "turbo",
direction = -1,
na.value = "white") +
scale_x_date(date_breaks = "2 week", date_labels = "%b-%d-%Y", limits = c(xmin,xmax)) +
xlab(" ") +
ylab("Hospitalizations") +
ggtitle(" ") +
theme_classic() +
theme(plot.margin = unit(c(2,0.5,2,0.5), "cm"),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
panel.background = element_blank(),
plot.background = element_blank(),
panel.border = element_blank(),
legend.title = element_text(size = 16, face = "bold", hjust=0.5),
legend.text = element_text(size=10, face="bold"),
strip.text = element_text(size=16, face="bold"),
strip.background = element_blank(),
legend.position = c(0.7, 0.5),
legend.direction = "vertical",
legend.key.width = unit(2,"line"),
axis.text.y = element_text(face="bold", size=14),
axis.text.x = element_text(face="bold", size=14, angle = 60, hjust=1),
axis.title.x = element_text(size=22, face="bold"),
axis.title.y = element_text(size=22, face="bold"),
plot.title = element_text(size=25, face="bold", hjust=0.5))