Note: The default ITS GitLab runner is a shared resource and is subject to slowdowns during heavy usage.
You can run your own GitLab runner that is dedicated just to your group if you need to avoid processing delays.

Commit ee585340 authored by Xuefei Zhang's avatar Xuefei Zhang
Browse files

Upload New File

parent 76e847da
# author: Xuefei Zhang and Boang Liu
# This script processes training patients who have survived for more than K (e.g., K = 1, 3, 5) years.
# It merges the separate lab files into one data matrix that can be put into a TVC model.
library(haven)
library(missForest)
library(survival)
# Lab data sources, one entry per lab file: the name is the CSV basename
# (without extension) and the value is the short column name that lab gets
# in the merged data frame.
lab_sources <- c(
  labalbumin = "ALB",
  labalkalinephosphataseratio = "ALK",
  labaltratio = "ALT",
  labastratio = "AST",
  labbilirubin = "BIL",
  labbloodureanitro = "BUN",
  labchloride = "CL",
  labcreatinine = "CRE",
  labglucose = "GLU",
  labhemoglobin = "HEM",
  labplt = "PLT",
  labpotassium = "K",
  labsodium = "Na",
  labwbc = "WBC",
  ScoreAPRI = "APRI",
  scoreastalt = "ASTALT"
)
# Parallel, position-aligned vectors used by the rest of the script.
var_list <- unname(lab_sources)
file_list <- names(lab_sources)
# Load the outcome table, keep the patients whose time to outcome is at least
# K years (the landmark), and re-express time to outcome relative to that
# landmark.
outcome <- read.csv("outcome.csv")
K <- 1 # K could be set as other values, such as 3, 5, etc.
landmark_days <- K * 365.25 # landmark expressed in days
# which() drops rows whose DaysToOutcome is NA; a bare logical subscript would
# instead turn each of them into an all-NA row (including an NA PatientID).
reached_landmark <- which(outcome$DaysToOutcome >= landmark_days)
outcome_Kyear <- outcome[reached_landmark, ]
# redefined time to outcome: days elapsed after the landmark
outcome_Kyear$DaysToOutcome <- outcome_Kyear$DaysToOutcome - landmark_days
pid_Kyear <- outcome_Kyear$PatientID # extract patient ids
# Build the base tmerge data set that the lab files are merged into
# sequentially; each later step reads one more file and merges it into the
# current data frame. tmerge() combines two data frames while splitting the
# follow-up time at finer cut points (see ?tmerge for examples).
data0 <- outcome_Kyear[order(outcome_Kyear$PatientID), ]
min(data0$DaysToOutcome)
data1 <- tmerge(
  data0, data0,
  id = PatientID,
  OutcomeTVC = event(DaysToOutcome, Outcome),
  options = list(idname = "PatientID")
)
# head(data1) # take a quick check of the merged data frame
# Merge every lab file into data1 as a time-dependent covariate.
# file_list[i] names the CSV to read and var_list[i] the column it becomes, so
# the two vectors must stay position-aligned.
stopifnot(length(file_list) == length(var_list))
# NOTE(review): DaysToOutcome was shifted by K years before data1 was built,
# while DaysSinceEnrollment is used here exactly as read from the file --
# confirm the lab files are already on the same (landmark) time scale.
for (i in seq_along(file_list)) {
  print(c(file_list[i], var_list[i]))
  data2 <- read.csv(paste0("./sample_data/", file_list[i], ".csv"))
  # keep only the selected patients, ordered by patient and measurement time
  data2 <- data2[data2$PatientID %in% pid_Kyear, ]
  data2 <- data2[order(data2$PatientID, data2$DaysSinceEnrollment), ]
  # tdc() adds the lab value under a placeholder column name, which is then
  # renamed to this lab's short name
  data1 <- tmerge(data1, data2, id = PatientID,
                  new_col = tdc(DaysSinceEnrollment, Value))
  names(data1)[names(data1) == "new_col"] <- var_list[i]
  print(dim(data1))
  # head(data1)
}
# Merge SVR (the treatment variable) into the data frame as one more
# time-dependent covariate, following the same steps used for the lab files.
svr_new <- read.csv("./sample_data/treatment.csv")
# rename the columns positionally to the names the tmerge() call below expects
data2 <- setNames(svr_new, c("PatientID", "DaysSinceEnrollment", "Value"))
data2 <- data2[data2$PatientID %in% pid_Kyear, ]
data2 <- data2[order(data2$PatientID, data2$DaysSinceEnrollment), ]
data1 <- tmerge(data1, data2, id = PatientID,
                new_col = tdc(DaysSinceEnrollment, Value))
names(data1)[names(data1) == "new_col"] <- "SVR"
# Drop every patient whose first record has no SVR value.
data_enrollment <- data1[!duplicated(data1$PatientID), ] # first record of each patient
svr_unknown <- is.na(data_enrollment$SVR)
pid_SVR_missing <- data_enrollment$PatientID[svr_unknown]
data1 <- data1[!(data1$PatientID %in% pid_SVR_missing), ]
## merge demographics
# The demographics file has a .csv extension like the other sample_data
# inputs, so it is read with read.csv(); haven::read_sas() parses SAS data
# files and cannot read a CSV.
# Race and Gender are forced to character so the "MISSING" recode below is a
# plain string assignment (not an assignment of an unknown factor level) and
# so a column holding only "F"/"T" is not converted to logical.
demo <- read.csv(
  "./sample_data/demographics.csv",
  colClasses = c(Race = "character", Gender = "character")
)
# inner join on PatientID: rows of patients absent from demo are dropped
data1 <- merge(data1, demo, by = "PatientID")
# recode a blank race as its own category before converting to factors
data1$Race[data1$Race == ""] <- "MISSING"
data1$Race <- as.factor(data1$Race)
data1$Gender <- as.factor(data1$Gender)
## impute missing values if there is any
# merge() sorts on the key column only and leaves the order of rows sharing a
# PatientID unspecified, so order by tstart (the interval-start column that
# tmerge() creates) before taking the first record of each patient.
time_ordered <- data1[order(data1$PatientID, data1$tstart), ]
data_enrollment <- time_ordered[!duplicated(time_ordered$PatientID), ] # first record of each patient
# (row, column) positions of every missing cell among the enrollment records
missing_idx <- which(is.na(data_enrollment), arr.ind = TRUE)
dim(missing_idx)
# number of missing values per column
na_count <- colSums(is.na(data_enrollment))
max(na_count)
# impute by median
# Replace the missing entries of x with the median of its observed entries.
#   x: a vector; the entries flagged by is.na() are overwritten
# Returns x with every NA replaced by median(x, na.rm = TRUE); entries that
# are not missing are left as they are.
fillByMedian <- function(x) {
  observed_median <- median(x, na.rm = TRUE)
  replace(x, is.na(x), observed_median)
}
# Median-impute the enrollment value of every lab, one column at a time.
# lapply() works on each column separately, whereas apply() would first coerce
# the whole data frame to a single-typed matrix.
data_enrollment_median_imputed <- lapply(data_enrollment[var_list], fillByMedian)
data_enrollment[var_list] <- data_enrollment_median_imputed
summary(data_enrollment)
# Keep the imputed enrollment values under <lab>_FIRST names and attach them
# to every row of the same patient.
data_enrollment_new <- data_enrollment[, c("PatientID", var_list)]
var_list_first <- paste0(names(data_enrollment_new)[-1], "_FIRST")
names(data_enrollment_new)[-1] <- var_list_first
# the only shared column should be the join key
intersect(names(data1), names(data_enrollment_new)) # "PatientID"
data1_new <- merge(data1, data_enrollment_new, by = "PatientID")
cbind(var_list, var_list_first)
# Wherever a lab value is missing in data1_new, fall back to the value stored
# in the matching _FIRST column of the same row.
for (i in seq_along(var_list)) {
  print(c(var_list[i], var_list_first[i]))
  rows_missing <- is.na(data1_new[[var_list[i]]])
  data1_new[rows_missing, var_list[i]] <-
    data1_new[rows_missing, var_list_first[i]]
}
# Report the filled data, keep only the columns data1 has (which drops the
# helper _FIRST columns), and write the result to disk.
summary(data1_new)
output_columns <- names(data1)
data1_new2 <- data1_new[, output_columns]
write.csv(
  data1_new2,
  file = "./sample_data/TVC_dataframe_all.csv",
  row.names = FALSE
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment