Preprocessing

Import and wrangle mutlisourced influenza observations

Preliminaries

Setup working environment and loading necessary packages.

Libraries

Hide code
#wrangling
library(tidyverse)
library(lubridate)
library(data.table, include.only = "fread")
library(cdcfluview)
library(yaml)

#spatial
library(sp)
library(sf)
library(spdep)
library(rgeos)
library(igraph)
library(maptools)
library(mapproj)
library(CovidCAR)
#devtools::install_github("JMHumphreys/CovidCAR")

#messages
library(cli)

#inference
library(INLA)

#Utilities
#source("./R/utilities.R")

options(dplyr.summarise.inform = FALSE)

#function
download_file <- function(url, filename) {
  download.file(url, destfile = filename, method = "auto", quiet = FALSE, mode = "wb")
}

Observation Data

FluSurv

Build a flu hospitalization data file from individual state reports and all available years. The hospitalizations() function from the cdcfluview package does most of the work by querying FluView, but seems only to be able to take small bites at a time.

Hide code
myRegions <- surveillance_areas() 

flusurv_all <- do.call(rbind, lapply(seq_len(dim(myRegions)[1]), function(i) {
  hospitalizations(surveillance_area = myRegions$surveillance_area[i], region = myRegions$region[i])
}))

Wrangle Flusurv data:
Note that the cdcfluview package only includes through Spring of 2020. Because of this, the data is filtered at 2019 and a static file manually downloaded from FluView with more recnt reports (eventually this data will be moved to www.healthdata.gov).

Hide code
range(flusurv_all$year)
[1] 2003 2020
Hide code
flusurv <- flusurv_all %>%
  filter(age_label == "Overall",
         region != "Entire Network",
         year >= 2010 & year <= 2019) %>% #the pkg fails on dates after 2020,ugh
  mutate(location_name = region,
         network = surveillance_area,
         weeklyrate = as.numeric(weeklyrate),
         epiweek = year_wk_num) %>%
    select(location_name, year, epiweek, network, rate, weeklyrate)
  
#manual download from site 2023-06-01
flusurv_2020 <- fread("D:/Github/flusion/data/FluSurveillance_2020.csv") %>%
  rename_all(~gsub(" |-", "", .)) %>%
  filter(AGECATEGORY == "Overall",
         SEXCATEGORY == "Overall",
         RACECATEGORY == "Overall",
         CATCHMENT != "Entire Network",
         MMWRYEAR >= 2020) %>% #Prior to this date was downloaded in code above
  mutate(location_name = CATCHMENT,
         network = NETWORK,
         year = MMWRYEAR,
         epiweek = MMWRWEEK,
         rate = CUMULATIVERATE,
         weeklyrate = as.numeric(WEEKLYRATE)) %>%
    select(location_name, year, epiweek, network, rate, weeklyrate)

#Join date ranges and scale weeklyrate
flusurv = rbind(flusurv, flusurv_2020)
flusurv$weeklyrate.s = as.numeric(scale(flusurv$weeklyrate, scale = T, center=T))

#combine NY data
flusurv$location_name[flusurv$location_name == "New York - Albany"] = "New York"
flusurv$location_name[flusurv$location_name == "New York - Rochester"] ="New York"

flusurv <- flusurv %>%
  group_by(location_name, year, epiweek) %>%
  summarise(rate = mean(rate, na.rm=T),
            weeklyrate = mean(weeklyrate, na.rm=T),
            weeklyrate.s = mean(weeklyrate.s, na.rm=T))


#Check for duplicates
unique(duplicated(flusurv))
[1] FALSE
Hide code
dim(flusurv)
[1] 5676    6
Hide code
head(flusurv)
location_name year epiweek rate weeklyrate weeklyrate.s
California 2010 1 NA 0.4 -0.4771582
California 2010 2 NA 0.2 -0.5561987
California 2010 3 NA 0.2 -0.5561987
California 2010 4 NA 0.2 -0.5561987
California 2010 5 NA 0.2 -0.5561987
California 2010 6 NA 0.1 -0.5957189

Same FluSurv process as above, but now for the Full Network reports

Hide code
flusurv_en <- flusurv_all %>%
  filter(age_label == "Overall",
         region == "Entire Network",
         year >= 2010 & year <= 2019) %>% #the pkg fails on dates after 2020,ugh
  mutate(location_name = region,
         network = surveillance_area,
         epiweek = year_wk_num) %>%
    select(location_name, year, epiweek, network, rate, weeklyrate)

flusurv_en2020 <- fread("D:/Github/flusion/data/FluSurveillance_2020.csv") %>%
  rename_all(~gsub(" |-", "", .)) %>%
  filter(AGECATEGORY == "Overall",
         SEXCATEGORY == "Overall",
         RACECATEGORY == "Overall",
         CATCHMENT == "Entire Network",
         MMWRYEAR >= 2020) %>% #the pkg fails on dates after 2020,ugh
  mutate(location_name = CATCHMENT,
         network = NETWORK,
         year = MMWRYEAR,
         epiweek = MMWRWEEK,
         rate = CUMULATIVERATE,
         weeklyrate = WEEKLYRATE) %>%
    select(location_name, year, epiweek, network, rate, weeklyrate)

#Join
flusurv_en <- rbind(flusurv_en, flusurv_en2020)
unique(duplicated(flusurv_en))
[1] FALSE
Hide code
dim(flusurv_en)
[1] 1131    6
Hide code
head(flusurv_en)
location_name year epiweek network rate weeklyrate
Entire Network 2016 40 FluSurv-NET 0.1 0.1
Entire Network 2016 41 FluSurv-NET 0.2 0.1
Entire Network 2016 42 FluSurv-NET 0.3 0.1
Entire Network 2016 43 FluSurv-NET 0.4 0.1
Entire Network 2016 44 FluSurv-NET 0.6 0.1
Entire Network 2016 45 FluSurv-NET 0.8 0.2

ILINet Surveillance Data

Influenza Like Illness (ILI) data using cdcfluview package. Unlike FluSurv, data is available 2010-2013.

Hide code
ilinet <- ilinet(region = "state") %>%
  mutate(location_name = region,
         epiweek = week,
         unweighted = as.numeric(unweighted_ili),
         unweighted.s = unweighted,
         total = ilitotal,
         providers = num_of_providers) %>%
  select(location_name, year, epiweek, unweighted, unweighted.s, total, providers)

# Clip to between 0.0001 and 99.999
ilinet$unweighted.s <- pmin(pmax(as.numeric(ilinet$unweighted.s), 0.01), 99.99)/100

# logit transform 
ilinet$unweighted.s <- log(ilinet$unweighted.s/(1-ilinet$unweighted.s))
range(ilinet$unweighted.s, na.rm=T)
[1] -9.210240 -1.423094
Hide code
unique(duplicated(ilinet))
[1] FALSE
Hide code
dim(ilinet)
[1] 35678     7
Hide code
range(ilinet$year)
[1] 2010 2023
Hide code
head(ilinet)
location_name year epiweek unweighted unweighted.s total providers
Alabama 2010 40 2.134770 -3.825232 249 35
Alaska 2010 40 0.875146 -4.729745 15 7
Arizona 2010 40 0.674721 -4.991856 172 49
Arkansas 2010 40 0.696056 -4.960510 18 15
California 2010 40 1.954120 -3.915496 632 112
Colorado 2010 40 0.660684 -5.013021 134 14

COVID19 RPIHC

Downloading the COVID-19 Reported Patient Impact and Hospital Capacity by State Timeseries from https://healthdata.gov/.

Hide code
url <- "https://healthdata.gov/api/views/g62h-syeh/rows.csv?accessType=DOWNLOAD"
filename <- "D:/Github/flusion/data/flu_HHS.csv"

download_file(url, filename)

Read and wrangle RPIHC:

Hide code
flu_HHS <- fread("D:/Github/flusion/data/flu_HHS.csv") %>%
  mutate(abbreviation = state,
         date = as_date(date) - 1, #1-day prior, per fluSight truth
         year = year(date),
         epiweek = epiweek(date)) %>%
  group_by(abbreviation, year, epiweek) %>%
  summarise(hosp_inc = sum(previous_day_admission_influenza_confirmed))

unique(duplicated(flu_HHS))
[1] FALSE
Hide code
dim(flu_HHS)
[1] 9346    4
Hide code
range(flu_HHS$year)
[1] 2019 2023
Hide code
head(flu_HHS)
abbreviation year epiweek hosp_inc
AK 2020 13 NA
AK 2020 14 NA
AK 2020 15 NA
AK 2020 16 NA
AK 2020 17 NA
AK 2020 18 NA

NREVSS

National Respiratory and Enteric Virus Surveillance System.

Again, unfortunately only available through a manual: https://gis.cdc.gov/grasp/fluview/fluportaldashboard.html
Files illustrated here were downloaded on June 2, 2023.

Hide code
nrevss.1 <- fread("D:/Github/flusion/data/WHO_NREVSS_Combined_prior_to_2015_16.csv") %>%
  rename_all(~gsub(" |-", "", .)) %>%
  mutate(location_name = REGION,
         year = YEAR,
         epiweek = WEEK,
         tot_perc = as.numeric(PERCENTPOSITIVE),
         Bpos = as.numeric(B),
         tot_samp = as.numeric(TOTALSPECIMENS),
         Apos = tot_perc - ((Bpos/tot_samp)*100)) %>%
  select(location_name, year, epiweek, Apos)


nrevss.2 <- fread("D:/Github/flusion/data/WHO_NREVSS_Clinical_Labs.csv") %>%
  rename_all(~gsub(" |-", "", .)) %>%
  mutate(location_name = REGION,
         year = YEAR,
         epiweek = WEEK,
         Apos = PERCENTA) %>%
  select(location_name, year, epiweek, Apos) #Apos = Influenza A positive

# Combine 
nrevss <- rbind(nrevss.1, nrevss.2)

# Replace "X" 
nrevss$Apos[nrevss$Apos == "X"] <- NA

# Clip to between 0.0001 and 99.999
nrevss$Apos <- pmin(pmax(as.numeric(nrevss$Apos), 0.01), 99.99)/100

# logit transform 
nrevss$Apos.s <- log(nrevss$Apos/(1-nrevss$Apos))
unique(duplicated(nrevss))
[1] FALSE
Hide code
head(nrevss)
location_name year epiweek Apos Apos.s
Alabama 2010 40 0.0001000 -9.210240
Alaska 2010 40 0.0001000 -9.210240
Arizona 2010 40 0.0250000 -3.663562
Arkansas 2010 40 0.0001000 -9.210240
California 2010 40 0.0273355 -3.571852
Colorado 2010 40 0.0079000 -4.832961

Location Template

Location Table

Location codes and population numbers from FluSight. Should really get varying census over time period, but these are a start.

Hide code
url <- "https://github.com/cdcepi/Flusight-forecast-data/raw/master/data-locations/locations.csv"
filename <- "D:/Github/flusion/data/locations.csv"

download_file(url, filename)

Read locations:

Hide code
locations <- fread("D:/Github/flusion/data/locations.csv") %>%
  select(-c(count_rate1per100k, count_rate2per100k)) %>%
  filter(location_name != "US") #remove aggregate group

head(locations)
abbreviation location location_name population
AL 01 Alabama 5039877
AK 02 Alaska 732673
AZ 04 Arizona 7276316
AR 05 Arkansas 3025891
CA 06 California 39237836
CO 08 Colorado 5812069

Template

Ensure all locations and times are represented in the analysis. Missing data are plugged with NA.

Dates and Locations

Hide code
myYears <- seq(2010, 2023, by = 1)
week_nums <- 1:52

year_set <- lapply(myYears, function(year) {
  tmp_frame <- locations %>% mutate(year = year)
  
  weekly_set <- lapply(week_nums, function(week_num) {
    tmp_frame_wk <- tmp_frame %>% mutate(epiweek = week_num)
    return(tmp_frame_wk)
  })
  
  weekly_set <- do.call(rbind, weekly_set)
  return(weekly_set)
})

template <- do.call(rbind, year_set)

dim(template)
[1] 38584     6
Hide code
head(template) #all states and times represented
abbreviation location location_name population year epiweek
AL 01 Alabama 5039877 2010 1
AK 02 Alaska 732673 2010 1
AZ 04 Arizona 7276316 2010 1
AR 05 Arkansas 3025891 2010 1
CA 06 California 39237836 2010 1
CO 08 Colorado 5812069 2010 1
Hide code
first_record <- min(subset(nrevss, year == 2010)$epiweek) #drop before nrevss data was collected
template$drop_old <- ifelse(template$year == 2010 & template$epiweek < first_record, "drop", "keep")

#most_recent <- max(subset(flu_HHS, year == 2023)$epiweek) #drop future dates
most_recent <- 22 #date available when model run
template$drop_new <- ifelse(template$year == 2023 & template$epiweek > most_recent, "drop", "keep")

template <- template %>% 
  filter(drop_old == "keep" &
         drop_new == "keep") %>%
  select(-c(drop_old, drop_new))

unique(duplicated(template))
[1] FALSE

Time Steps

An index with sequential timesteps.

Hide code
template <- template %>%
  arrange(year, epiweek) %>%
  mutate(ts_week = as.integer(as.factor(year + (epiweek/53))))


range(template$ts_week) #number of epiweeks
[1]   1 659

Spatial Domian

Need to setup directories to use CovidCAR functions. Values here are arbitrary (need to add option to bypass to CovidCAR)

Hide code
setup_analysis(report_date = "2010-01-01", 
               training_period = 2*28, #days
               forecast_horizon = 28, #days
               output_dir = "D:/Github/flusion/data"
)

Get Geographic Boundaries

Hide code
States <- download_boundaries(unit = "state")
Reading layer `us-state-boundaries' from data source 
  `D:\Github\flusion\data\2010-01-01-CovidCAR-run2023-06-14\polygons' 
  using driver `ESRI Shapefile'
Simple feature collection with 56 features and 20 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -179.2311 ymin: -14.60181 xmax: 179.8597 ymax: 71.44069
Geodetic CRS:  WGS 84
Hide code
class(States)
[1] "SpatialPolygonsDataFrame"
attr(,"package")
[1] "sp"
Hide code
head(States@data[,c("Region", "State")]) #appended attributes  
Region State
1 Virgin Islands
2 Wisconsin
3 Vermont
4 New Jersey
5 Colorado
6 South Carolina

Adjacency

Hide code
nb_flusion = get_neighbors(States, connect=TRUE)
summary(nb_flusion)
Neighbour list object:
Number of regions: 56 
Number of nonzero links: 242 
Percentage nonzero weights: 7.716837 
Average number of links: 4.321429 
Link number distribution:

 1  2  3  4  5  6  7  8 
 5  3 10 12 10 11  3  2 
5 least connected regions:
1 13 38 42 54 with 1 link
2 most connected regions:
49 56 with 8 links
Hide code
#view
plot_neighbors(States, nb_flusion)

Hide code
#convert to matrix
nb2INLA("J", nb_flusion)
J = inla.read.graph("J")

Template Spatial Index

Hide code
template$Region =  with(States@data[,c("Region", "State")],
                       Region[match(
                        template$location_name,
                                 State)])

Add Entire Network

Getting the flue weeklyrate from FluSurv. Not that this is only the time trend from FluSurv and is not loaction specific.

Hide code
en_match <- flusurv_en %>%
  group_by(year, epiweek) %>%
  summarise(en_est = mean(as.numeric(weeklyrate), na.rm=TRUE)) %>%
  select(year, epiweek, en_est)

# Clip to between 0.0001 and 99.999
en_match$en_est.s <- pmin(pmax(as.numeric(en_match$en_est), 0.01), 99.99)/100

# logit transform 
en_match$en_est.s <- round(log(en_match$en_est.s/(1-en_match$en_est.s)), 3)
range(en_match$en_est.s, na.rm=T)
[1] -9.21 -2.19
Hide code
template <- left_join(template, en_match, by = c("year", "epiweek"))
unique(duplicated(template))
[1] FALSE

Space-Time Interaction Index

All location-time combinations.

Hide code
template$st_int <- as.integer(as.factor(paste0(template$Region, ".", template$ts_week)))
range(template$st_int)
[1]     1 34927

Join to Disease Data

Join observation data to the template. Times and locations without observations are coded as NA.

Hide code
#FluSurv
flusurv_full <- left_join(template, flusurv, by = c("location_name", "year", "epiweek")) 
flusurv_full$network[is.na(flusurv_full$network)] = "none"
unique(duplicated(flusurv_full))
[1] FALSE
Hide code
head(flusurv_full) #times and locations w/out values assigned NA
abbreviation location location_name population year epiweek ts_week Region en_est en_est.s st_int rate weeklyrate weeklyrate.s network
AL 01 Alabama 5039877 2010 40 1 21 0.0333333 -8.006 8568 NA NA NA NA
AK 02 Alaska 732673 2010 40 1 38 0.0333333 -8.006 19771 NA NA NA NA
AZ 04 Arizona 7276316 2010 40 1 11 0.0333333 -8.006 1319 NA NA NA NA
AR 05 Arkansas 3025891 2010 40 1 34 0.0333333 -8.006 17135 NA NA NA NA
CA 06 California 39237836 2010 40 1 47 0.0333333 -8.006 25702 NA 0.1 -0.5957189 NA
CO 08 Colorado 5812069 2010 40 1 5 0.0333333 -8.006 27679 NA 0.1 -0.5957189 NA
Hide code
#ILI Surveillance
ilinet_full <- left_join(template, ilinet, by = c("location_name", "year", "epiweek"))
unique(duplicated(ilinet_full))
[1] FALSE
Hide code
head(ilinet_full)
abbreviation location location_name population year epiweek ts_week Region en_est en_est.s st_int unweighted unweighted.s total providers
AL 01 Alabama 5039877 2010 40 1 21 0.0333333 -8.006 8568 2.134770 -3.825232 249 35
AK 02 Alaska 732673 2010 40 1 38 0.0333333 -8.006 19771 0.875146 -4.729745 15 7
AZ 04 Arizona 7276316 2010 40 1 11 0.0333333 -8.006 1319 0.674721 -4.991856 172 49
AR 05 Arkansas 3025891 2010 40 1 34 0.0333333 -8.006 17135 0.696056 -4.960510 18 15
CA 06 California 39237836 2010 40 1 47 0.0333333 -8.006 25702 1.954120 -3.915496 632 112
CO 08 Colorado 5812069 2010 40 1 5 0.0333333 -8.006 27679 0.660684 -5.013021 134 14
Hide code
#HHS 
flu_HHS_full <- left_join(template, flu_HHS, by = c("abbreviation", "year", "epiweek"))
unique(duplicated(flu_HHS_full))
[1] FALSE
Hide code
unique(duplicated(flu_HHS_full[,c("ts_week", "Region")]))
[1] FALSE
Hide code
head(flu_HHS_full)
abbreviation location location_name population year epiweek ts_week Region en_est en_est.s st_int hosp_inc
AL 01 Alabama 5039877 2010 40 1 21 0.0333333 -8.006 8568 NA
AK 02 Alaska 732673 2010 40 1 38 0.0333333 -8.006 19771 NA
AZ 04 Arizona 7276316 2010 40 1 11 0.0333333 -8.006 1319 NA
AR 05 Arkansas 3025891 2010 40 1 34 0.0333333 -8.006 17135 NA
CA 06 California 39237836 2010 40 1 47 0.0333333 -8.006 25702 NA
CO 08 Colorado 5812069 2010 40 1 5 0.0333333 -8.006 27679 NA
Hide code
#nrevss 
nrevss_full <- left_join(template, nrevss, by = c("location_name", "year", "epiweek"))
unique(duplicated(nrevss_full))
[1] FALSE
Hide code
head(nrevss_full)
abbreviation location location_name population year epiweek ts_week Region en_est en_est.s st_int Apos Apos.s
AL 01 Alabama 5039877 2010 40 1 21 0.0333333 -8.006 8568 0.0001000 -9.210240
AK 02 Alaska 732673 2010 40 1 38 0.0333333 -8.006 19771 0.0001000 -9.210240
AZ 04 Arizona 7276316 2010 40 1 11 0.0333333 -8.006 1319 0.0250000 -3.663562
AR 05 Arkansas 3025891 2010 40 1 34 0.0333333 -8.006 17135 0.0001000 -9.210240
CA 06 California 39237836 2010 40 1 47 0.0333333 -8.006 25702 0.0273355 -3.571852
CO 08 Colorado 5812069 2010 40 1 5 0.0333333 -8.006 27679 0.0079000 -4.832961

View Coverage

Looking at holes in the data. Not interested in exact values, only comparing data coverage.

Hide code
fs_plt <- flusurv_full %>% 
  mutate(value = weeklyrate.s,
         set = "FluSurv") %>%
  select(location_name, ts_week, value, set)

ili_plt <- ilinet_full %>% 
  mutate(value = unweighted.s,
          set = "ILI") %>%
  select(location_name, ts_week, value, set)

hhs_plt <- flu_HHS_full %>% 
  mutate(value = log(hosp_inc+0.0001),
          set = "HHS") %>%
  select(location_name, ts_week, value, set)

nrevss_plt <- nrevss_full %>% 
  mutate(value = Apos.s,
          set = "NREVSS") %>%
  select(location_name, ts_week, value, set)


all_plts <- rbind(fs_plt, ili_plt, hhs_plt, nrevss_plt)
all_plts$set <- ordered(factor(all_plts$set), levels = c("FluSurv", "ILI", "NREVSS", "HHS"))

ggplot() +
geom_tile(data=all_plts,
          aes(ts_week, location_name, fill = value)) +
xlab(" ") +
viridis::scale_fill_viridis(paste0(" "),
                             discrete=F,
                             option = "turbo",
                             direction = -1,
                             na.value = "white") +
ylab("Location") +
xlab("Weekly Timesteps (2010-2023)") +
facet_wrap(~set, ncol = 4) +
theme(panel.grid.minor = element_blank(),
      panel.grid.major = element_blank(),
      panel.background = element_blank(),
      plot.background = element_blank(),
      panel.border = element_blank(),
      legend.title = element_text(size = 16, face = "bold", hjust=0.5),
      legend.text = element_text(size=10, face="bold"),
      strip.text = element_text(size=16, face="bold"),
      strip.background = element_blank(),
      legend.position="none", 
      legend.direction = "horizontal",
      legend.box = "horizontal",
      axis.text.y = element_text(face="bold", size=5),
      axis.text.x = element_text(face="bold", size=12, vjust=0.5,
                                 hjust=1, angle=90),
      axis.title.x = element_text(size=12, face="bold"),
      axis.title.y = element_text(size=18, face="bold"),
      plot.title = element_text(size=18, face="bold", hjust=0.5)) +
guides(color = guide_legend(title.position = "top", label.position = "bottom"))