# Source: commit ee585340 (parent 76e847da), "Upload New File", authored by Xuefei Zhang.
# Authors: Xuefei Zhang and Boang Liu
# This script processes the training patients who have survived for at least K years (e.g., K = 1, 3, 5).
# It merges the separate lab files into one data matrix that can be fed into the TVC model.
library(haven)
library(missForest)
library(survival)
# Lab variables and the CSV files (given without the ".csv" extension) that
# hold them. Each entry reads <short variable name> = <lab file stem>, which
# keeps the pairing between the two parallel vectors below explicit.
lab_files <- c(
  ALB = "labalbumin",
  ALK = "labalkalinephosphataseratio",
  ALT = "labaltratio",
  AST = "labastratio",
  BIL = "labbilirubin",
  BUN = "labbloodureanitro",
  CL = "labchloride",
  CRE = "labcreatinine",
  GLU = "labglucose",
  HEM = "labhemoglobin",
  PLT = "labplt",
  K = "labpotassium",
  Na = "labsodium",
  WBC = "labwbc",
  APRI = "ScoreAPRI",
  ASTALT = "scoreastalt"
)
# Column names used for the labs in the merged data frame.
var_list <- names(lab_files)
# File stems, in the same order as var_list.
file_list <- unname(lab_files)
# Load the outcome table and keep the patients whose time to outcome is at
# least K years (the landmark).
# NOTE(review): this file is read from the working directory while every other
# input is read from ./sample_data/ -- confirm the path.
outcome <- read.csv("outcome.csv")
K <- 1 # landmark in years; other values such as 3 or 5 can be used
landmark_days <- K * 365.25
reached_landmark <- outcome$DaysToOutcome >= landmark_days
outcome_Kyear <- outcome[reached_landmark, ]
# Re-express the time to outcome as days after the landmark.
outcome_Kyear$DaysToOutcome <- outcome_Kyear$DaysToOutcome - landmark_days
# Patient ids of the landmark cohort.
pid_Kyear <- outcome_Kyear$PatientID
# The lab files are merged sequentially: each new lab file is read and merged
# into the current data frame.
# tmerge() merges two data frames on a finer set of time split points
# (see ?tmerge for examples).
# Start from the landmark outcome table, sorted by patient.
data0 <- outcome_Kyear[order(outcome_Kyear$PatientID), ]
min(data0$DaysToOutcome) # interactive check of the shortest remaining time
# First tmerge() call: adds the outcome as the event column OutcomeTVC at
# time DaysToOutcome, keyed by PatientID.
data1 <- tmerge(
  data0,
  data0,
  id = PatientID,
  OutcomeTVC = event(DaysToOutcome, Outcome),
  options = list(idname = "PatientID")
)
# head(data1) # take a quick look at the merged data frame
# Merge the lab files into data1 one at a time. Each file is expected to have
# the columns PatientID, DaysSinceEnrollment and Value; the lab is added as a
# time-dependent covariate and then renamed to its short name from var_list.
# NOTE(review): DaysToOutcome was shifted by K years earlier in this script,
# but DaysSinceEnrollment is used unshifted here -- confirm that the lab files
# are already on the landmark time scale.
# file_list and var_list are parallel vectors; fail early if they drift apart.
stopifnot(length(file_list) == length(var_list))
for (lab_idx in seq_along(file_list)) {
  lab_file <- file_list[lab_idx]
  lab_name <- var_list[lab_idx]
  print(c(lab_file, lab_name))
  data2 <- read.csv(paste0("./sample_data/", lab_file, ".csv"))
  # Keep only the landmark cohort, ordered by patient and measurement time.
  data2 <- data2[data2$PatientID %in% pid_Kyear, ]
  data2 <- data2[order(data2$PatientID, data2$DaysSinceEnrollment), ]
  data1 <- tmerge(data1, data2,
    id = PatientID,
    new_col = tdc(DaysSinceEnrollment, Value)
  )
  # tmerge() names the column after the argument, so rename it afterwards.
  names(data1)[names(data1) == "new_col"] <- lab_name
  print(dim(data1))
  # head(data1)
}
# Add SVR (the treatment variable) as one more time-dependent covariate.
svr_new <- read.csv("./sample_data/treatment.csv")
# Rename the columns by position to the layout used for the lab files.
data2 <- setNames(svr_new, c("PatientID", "DaysSinceEnrollment", "Value"))
# Keep only the landmark cohort, ordered by patient and time.
data2 <- data2[data2$PatientID %in% pid_Kyear, ]
data2 <- data2[order(data2$PatientID, data2$DaysSinceEnrollment), ]
data1 <- tmerge(data1, data2,
  id = PatientID,
  new_col = tdc(DaysSinceEnrollment, Value)
)
names(data1)[names(data1) == "new_col"] <- "SVR"
# Drop every patient whose first row has no SVR value.
is_first_row <- !duplicated(data1$PatientID)
data_enrollment <- data1[is_first_row, ] # first record of each patient
svr_is_missing <- is.na(data_enrollment$SVR)
pid_SVR_missing <- data_enrollment$PatientID[svr_is_missing]
keep_row <- !(data1$PatientID %in% pid_SVR_missing)
data1 <- data1[keep_row, ]
## merge demographics
# The demographics table is a CSV file, so it is read with read.csv();
# haven::read_sas() only parses SAS data files and fails on CSV input.
# stringsAsFactors = FALSE keeps Race as plain text on every R version so the
# "MISSING" label below can be assigned.
demo <- read.csv("./sample_data/demographics.csv", stringsAsFactors = FALSE)
# Inner join: patients without a demographics row are dropped.
data1 <- merge(data1, demo, by = "PatientID")
# Label blank (or NA) race values explicitly before converting to a factor.
race_unknown <- is.na(data1$Race) | data1$Race == ""
data1$Race[race_unknown] <- "MISSING"
data1$Race <- as.factor(data1$Race)
data1$Gender <- as.factor(data1$Gender)
## impute missing values if there is any
# merge() does not promise to keep rows in their within-patient order, so sort
# by patient and interval start (tmerge's tstart column) before taking the
# first row of each patient as the enrollment record.
data1_by_time <- data1[order(data1$PatientID, data1$tstart), ]
data_enrollment <- data1_by_time[!duplicated(data1_by_time$PatientID), ]
# Interactive checks: positions of the missing cells and the largest
# per-column NA count.
missing_idx <- which(is.na(data_enrollment), arr.ind = TRUE)
dim(missing_idx)
na_count <- vapply(
  data_enrollment,
  function(column) sum(is.na(column)),
  integer(1)
)
max(na_count)
# Median imputation helper.
# x: a vector, possibly containing NA.
# Returns x with every NA replaced by the median of its non-missing entries
# (the result stays NA when x has no non-missing entries).
fillByMedian <- function(x) {
  observed_median <- median(x, na.rm = TRUE)
  replace(x, is.na(x), observed_median)
}
# Median-impute the lab columns of the enrollment rows one column at a time.
# lapply() works on each column directly; apply() would first coerce the
# whole data frame to a matrix.
data_enrollment_median_imputed <- lapply(
  data_enrollment[var_list],
  fillByMedian
)
data_enrollment[var_list] <- data_enrollment_median_imputed
summary(data_enrollment)
# Keep the imputed enrollment values under "<lab>_FIRST" names so they can be
# joined back onto every row of the same patient.
data_enrollment_new <- data_enrollment[, c("PatientID", var_list)]
var_list_first <- paste0(names(data_enrollment_new)[-1], "_FIRST")
names(data_enrollment_new)[-1] <- var_list_first
# Interactive check: the only shared column should be "PatientID".
intersect(names(data1), names(data_enrollment_new))
data1_new <- merge(data1, data_enrollment_new, by = "PatientID")
cbind(var_list, var_list_first)
# For every lab column, fill the rows that are still NA with the value stored
# in the matching "_FIRST" column of the same row.
for (lab_idx in seq_along(var_list)) {
  lab_col <- var_list[lab_idx]
  first_col <- var_list_first[lab_idx]
  print(c(lab_col, first_col))
  rows_missing <- is.na(data1_new[[lab_col]])
  data1_new[rows_missing, lab_col] <- data1_new[rows_missing, first_col]
}
# Final interactive check, then restrict to the columns of data1 (this drops
# the helper "_FIRST" columns) and save the result.
summary(data1_new)
data1_new2 <- data1_new[, names(data1)]
write.csv(
  data1_new2,
  file = "./sample_data/TVC_dataframe_all.csv",
  row.names = FALSE
)
# End of script.