# Commit 76e847da authored by Xuefei Zhang
# Commit message: Upload New File
# Parent commit: a0e1044f
# author: Xuefei Zhang and Boang Liu
# This script processes the data set used for testing, using the first L years of predictors.
library(haven)
library(survival)
#### process outcome ####
# Build the outcome label for the test set and write it to CSV.
#   input : './sample_data/outcome.csv'
#           (columns used: PatientID, DaysToOutcome, Outcome)
#   output: './sample_data/dataOutcome_Kyear_outcome_Lyear_predictor.csv'
#           (columns: PatientID, Outcome_LandK)
# L is the number of years of predictors, K the number of years of outcome.
data_outcome <- read.csv('./sample_data/outcome.csv')
L <- 2 # L could be set to other values, such as L = 4, etc.

# Keep only patients whose DaysToOutcome exceeds L years. which() is used so
# that a missing DaysToOutcome drops the row; plain logical indexing would
# instead insert an all-NA row for every NA in the condition.
data_outcome <- data_outcome[which(data_outcome$DaysToOutcome > 365.25 * L), ]
dim(data_outcome)
head(data_outcome)
data_outcome <- data_outcome[order(data_outcome$PatientID), ]
head(data_outcome)
table(data_outcome$Outcome)

K <- 1 # K could be set to other values, such as K = 3, 5, etc.

# Label rules (vectorised over all patients, so zero rows are handled too):
#   0  : DaysToOutcome >= 365.25 * (L + K)
#   1  : DaysToOutcome <  365.25 * (L + K) and Outcome == 1
#   NA : everything else (including a missing Outcome inside the window)
outcome_label <- rep(NA_real_, nrow(data_outcome))
beyond_window <- data_outcome$DaysToOutcome >= 365.25 * (L + K)
outcome_label[which(!beyond_window & data_outcome$Outcome == 1)] <- 1
outcome_label[which(beyond_window)] <- 0

data_outcome_LandK <- data.frame(PatientID = data_outcome$PatientID,
                                 Outcome_LandK = outcome_label)
write.csv(data_outcome_LandK,
          file = './sample_data/dataOutcome_Kyear_outcome_Lyear_predictor.csv',
          row.names = FALSE)
###### Everything from here on uses L but not K, so it only has to be run once
###### and does not need to be repeated for K = 3 and K = 5.
#### process predictors ####
# Baseline frame for the first tmerge() call: the retained patients, with
# DaysToOutcome overwritten by 365.25 * L and Outcome overwritten by 0
# (presumably so that every patient's time range ends at L years with no
# event recorded -- confirm against the survival::tmerge documentation).
data_start <- data_outcome
data_start$DaysToOutcome <- 365.25 * L
data_start$Outcome <- 0
data1 <- tmerge(data_start, data_start, id = PatientID,
                OutcomeTVC = event(DaysToOutcome, Outcome),
                options = list(idname = "PatientID"))
head(data1)

# Column name given to each lab variable in data1 ...
var_list <- c("ALB", "ALK", "ALT", "AST", "BIL", "BUN", "CL", "CRE",
              "GLU", "HEM", "PLT", "K", "Na", "WBC", "APRI",
              "ASTALT")
# ... and, position by position, the CSV (without extension) it is read from.
file_list <- c("labalbumin", "labalkalinephosphataseratio", "labaltratio",
               "labastratio", "labbilirubin", "labbloodureanitro",
               "labchloride", "labcreatinine", "labglucose",
               "labhemoglobin", "labplt", "labpotassium", "labsodium",
               "labwbc", "ScoreAPRI", "scoreastalt")
n_var <- length(var_list)

# Merge the lab files into data1 one at a time as time-dependent covariates.
for (lab_idx in seq_len(n_var)) {
  lab_file <- file_list[lab_idx]
  lab_name <- var_list[lab_idx]
  print(lab_idx)
  print(c(lab_file, lab_name))

  data2 <- read.csv(paste0("./sample_data/", lab_file, ".csv"))
  # Restrict to retained patients, then sort by patient and measurement day.
  in_cohort <- data2$PatientID %in% data_start$PatientID
  data2 <- data2[in_cohort, ]
  data2 <- data2[order(data2$PatientID, data2$DaysSinceEnrollment), ]

  # tmerge() needs a literal column name, so the covariate is added under the
  # placeholder "new_col" and renamed to the lab's own name afterwards.
  data1 <- tmerge(data1, data2, id = PatientID,
                  new_col = tdc(DaysSinceEnrollment, Value))
  names(data1)[names(data1) == "new_col"] <- lab_name
  print(dim(data1))
}

# The treatment file goes through the same filter / sort / merge steps and
# ends up as the column 'SVR'.
svr_new <- read.csv("./sample_data/treatment.csv")
data2 <- svr_new
data2 <- data2[data2$PatientID %in% data_start$PatientID, ]
data2 <- data2[order(data2$PatientID, data2$DaysSinceEnrollment), ]
data1 <- tmerge(data1, data2, id = PatientID,
                new_col = tdc(DaysSinceEnrollment, Value))
names(data1)[names(data1) == "new_col"] <- "SVR"

# Remove every patient whose first record has a missing SVR.
# (The original author's note recorded 10248 such patients; that count is
# presumably from the full cohort rather than the sample data -- confirm.)
data_enrollment <- data1[!duplicated(data1$PatientID), ] # first record of each patient
pid_SVR_missing <- data_enrollment$PatientID[is.na(data_enrollment$SVR)]
data1 <- data1[!(data1$PatientID %in% pid_SVR_missing), ]
#### add demo and impute missing ####
# NOTE(review): the next line replaces the `data1` built in the predictor
# section with a previously saved file. The path lies outside './sample_data/',
# its name says "4year" although L is set to 2 in this script, and nothing in
# this script writes that file. Confirm whether the in-memory `data1` should be
# used here instead. The demographics path below is also outside
# './sample_data/'.
data1 <- read.csv('TimeVaryingCovariate/data_processed/dataTest_4year_predictor_SVRMissingRemoved_v1.csv')
demo <- read_sas('./Cohort/cohort_demographics.sas7bdat')
data1 <- merge(data1, demo, by = 'PatientID')

# Recode AgeAtFirstAPRI2 == 999 to 90 (presumably 999 is a sentinel value for
# a top-coded age -- confirm against the cohort data dictionary).
data1$AgeAtFirstAPRI2[data1$AgeAtFirstAPRI2 == 999] <- 90
# Give empty Race strings an explicit level, then store Race and Gender as
# factors.
data1$Race[data1$Race == ''] <- "MISSING"
data1$Race <- as.factor(data1$Race)
data1$Gender <- as.factor(data1$Gender)
data1$genotype_main <- NULL # drop this column

## impute missing
data_enrollment <- data1[!duplicated(data1$PatientID), ] # first record of each patient
# record the locations (row, column) of missing values
missing_idx <- which(is.na(data_enrollment), arr.ind = TRUE)
dim(missing_idx)
table(data_enrollment$SVR)
# Number of missing values per column. colSums(is.na(.)) counts column by
# column without coercing the whole data frame to a matrix as apply() does.
na_count <- colSums(is.na(data_enrollment))
max(na_count)
# impute by median first
# Replace every NA in `x` with the median of the non-missing values of `x`.
#   x: a vector accepted by median() (numeric in this script).
# Returns `x` with its NA entries filled in; when all of `x` is NA the median
# is itself NA, so the vector comes back unchanged.
fillByMedian <- function(x) {
  replace(x, is.na(x), median(x, na.rm = TRUE))
}
# Median-impute the first-record value of every lab variable in var_list.
# lapply() handles one column at a time, so the data frame is never coerced to
# a matrix (which apply() would do) and a single-row frame keeps its shape.
data_enrollment_median_imputed <- lapply(data_enrollment[var_list], fillByMedian)
data_enrollment[var_list] <- data_enrollment_median_imputed
summary(data_enrollment)

# One row per patient holding the imputed first-record values, with each lab
# column renamed to <lab>_FIRST so it can sit next to the original column.
data_enrollment_new <- data_enrollment[, c('PatientID', var_list)]
var_list_first <- paste0(var_list, '_FIRST')
names(data_enrollment_new)[-1] <- var_list_first
intersect(names(data1), names(data_enrollment_new)) # expected: only "PatientID"
data1_new <- merge(data1, data_enrollment_new, by = 'PatientID')
cbind(var_list, var_list_first)

# Wherever a lab value is missing in a patient's record, fill it with that
# patient's (imputed) first-record value of the same lab.
for (lab_idx in seq_along(var_list)) {
  print(c(var_list[lab_idx], var_list_first[lab_idx]))
  rows_missing <- is.na(data1_new[[var_list[lab_idx]]])
  data1_new[rows_missing, var_list[lab_idx]] <-
    data1_new[rows_missing, var_list_first[lab_idx]]
}
summary(data1_new)

# Drop the helper *_FIRST columns again by keeping only data1's own columns.
data1_new2 <- data1_new[, names(data1)]
write.csv(data1_new2,
          file = './sample_data/TVC_dataframe_test.csv',
          row.names = FALSE)
# (Web-page residue from the repository viewer; not part of the script.)
# Supports Markdown
# 0% or .
# You are about to add 0 people to the discussion. Proceed with caution.
# Finish editing this message first!
# Please register or to comment