Note: The default ITS GitLab runner is a shared resource and is subject to slowdowns during heavy usage.
You can run your own GitLab runner that is dedicated just to your group if you need to avoid processing delays.

Commit 76e847da authored by Xuefei Zhang's avatar Xuefei Zhang
Browse files

Upload New File

parent a0e1044f
# author: Xuefei Zhang and Boang Liu
# This script processes the test set, using the first L years of predictors.
library(haven)
library(survival)
#### process outcome ####
# Read the outcome data and keep only patients observed for at least the
# first L years; those first L years of history supply the predictors for
# the test set. Columns referenced below: PatientID, DaysToOutcome, Outcome.
data_outcome = read.csv('./sample_data/outcome.csv')
L = 2 # length (in years) of the predictor window; can be set to other values, e.g. L = 4
# Keep patients followed for more than L years (365.25 days/year).
data_outcome = data_outcome[data_outcome$DaysToOutcome > 365.25*L,]
dim(data_outcome)   # sanity check: rows remaining after the L-year filter
head(data_outcome)
# Sort by patient so later per-patient loops/merges line up by PatientID.
data_outcome = data_outcome[order(data_outcome$PatientID),]
head(data_outcome)
table(data_outcome$Outcome) # event distribution in the retained cohort
K = 1 # prediction horizon (years) after the L-year predictor window; e.g. K = 3, 5
# Label each patient's outcome within K years after the first L years:
#   0  -> followed at least L+K years without reaching the outcome first,
#   1  -> had the event (Outcome == 1) before year L+K,
#   NA -> censored: no event, but follow-up ended before L+K years.
# The vectorized ifelse() replaces the original row-by-row loop; it also
# handles an empty data_outcome (the loop's 1:nrow() would iterate over
# c(1, 0)) and does not crash if Outcome contains NA (the loop's scalar
# `if` would error on an NA condition).
data_outcome_LandK = data.frame(
  PatientID = data_outcome$PatientID,
  Outcome_LandK = ifelse(
    data_outcome$DaysToOutcome >= 365.25 * (L + K), 0,
    ifelse(data_outcome$Outcome == 1, 1, NA)
  )
)
write.csv(data_outcome_LandK,
          file='./sample_data/dataOutcome_Kyear_outcome_Lyear_predictor.csv',
          row.names = F)
###### The following: only processed once, do not need to repeat for K = 3 and K = 5
#### process predictors ####
# Build the baseline for the time-varying-covariate data set: each retained
# patient is represented as event-free (Outcome = 0) over exactly the first
# L years, which is the window the predictors are drawn from.
data_start = data_outcome
data_start$DaysToOutcome = 365.25 * L
data_start$Outcome = 0
# survival::tmerge creates the counting-process skeleton (one interval per
# patient) with OutcomeTVC as the event indicator at DaysToOutcome.
data1 = tmerge(data_start, data_start, id=PatientID, OutcomeTVC=event(DaysToOutcome, Outcome),
options= list(idname="PatientID"))
head(data1)
# Lab/score covariates: short variable name mapped to the CSV basename
# (under ./sample_data/) that holds that variable's measurements.
lab_files <- c(
  ALB    = "labalbumin",
  ALK    = "labalkalinephosphataseratio",
  ALT    = "labaltratio",
  AST    = "labastratio",
  BIL    = "labbilirubin",
  BUN    = "labbloodureanitro",
  CL     = "labchloride",
  CRE    = "labcreatinine",
  GLU    = "labglucose",
  HEM    = "labhemoglobin",
  PLT    = "labplt",
  K      = "labpotassium",
  Na     = "labsodium",
  WBC    = "labwbc",
  APRI   = "ScoreAPRI",
  ASTALT = "scoreastalt"
)
var_list <- names(lab_files)
file_list <- unname(lab_files)
n_var <- length(var_list)
# Merge each lab/score series into data1 as a time-dependent covariate.
# Each file shares the layout: PatientID, DaysSinceEnrollment, Value.
for (idx in seq_len(n_var)) {
  print(idx)
  print(c(file_list[idx], var_list[idx]))
  lab <- read.csv(paste0('./sample_data/', file_list[idx], '.csv'))
  # Restrict to the test cohort, then order measurements within patient.
  lab <- lab[lab$PatientID %in% data_start$PatientID, ]
  lab <- lab[order(lab$PatientID, lab$DaysSinceEnrollment), ]
  # tdc() carries each measurement forward as a time-dependent covariate;
  # rename the generic column to the variable's short name.
  data1 <- tmerge(data1, lab, id = PatientID,
                  new_col = tdc(DaysSinceEnrollment, Value))
  names(data1)[names(data1) == "new_col"] <- var_list[idx]
  print(dim(data1))
}
# Add SVR (treatment response) as a time-dependent covariate, then drop
# patients for whom SVR was never observed.
svr_new <- read.csv('./sample_data/treatment.csv')
svr <- svr_new[svr_new$PatientID %in% data_start$PatientID, ]
svr <- svr[order(svr$PatientID, svr$DaysSinceEnrollment), ]
data1 <- tmerge(data1, svr, id = PatientID,
                new_col = tdc(DaysSinceEnrollment, Value))
names(data1)[names(data1) == "new_col"] <- 'SVR'
# remove patients with missing SVR
# NOTE(review): tdc() leaves a covariate NA before the patient's first
# measurement, so NA on the first interval indicates no usable SVR record.
data_enrollment <- data1[!duplicated(data1$PatientID), ] # first record of each patient
pid_SVR_missing <- data_enrollment$PatientID[is.na(data_enrollment$SVR)]
data1 <- data1[!(data1$PatientID %in% pid_SVR_missing), ]
#### add demo and impute missing ####
# NOTE(review): the original script re-read data1 here from
# 'TimeVaryingCovariate/data_processed/dataTest_4year_predictor_SVRMissingRemoved_v1.csv',
# which discarded everything constructed above and pointed at a path that
# does not match this script's ./sample_data/ layout. That stale re-read
# (apparently left over from an interactive session) has been removed so
# the steps below operate on the data1 built above.
demo <- read_sas('./Cohort/cohort_demographics.sas7bdat')
data1 <- merge(data1, demo, by = 'PatientID')
# 999 appears to be a sentinel for topcoded/unknown age; recoded to 90 —
# TODO confirm against the cohort's data dictionary.
data1$AgeAtFirstAPRI2[data1$AgeAtFirstAPRI2 == 999] <- 90
# Empty race strings become an explicit "MISSING" factor level.
data1$Race[data1$Race == ''] <- "MISSING"
data1$Race <- as.factor(data1$Race)
data1$Gender <- as.factor(data1$Gender)
data1$genotype_main <- NULL # drop genotype; not referenced later in this script
## impute missing
# Work on one row per patient (the first/enrollment interval) so lab
# medians are computed across patients, not across repeated intervals.
data_enrollment <- data1[!duplicated(data1$PatientID),] # first record of each patient
# record the locations of missing values
missing_idx <- which(is.na(data_enrollment), arr.ind = T)
dim(missing_idx)    # number of missing cells x 2 (row index, column index)
table(data_enrollment$SVR)  # SVR distribution after removing missing-SVR patients
na_count <- apply(data_enrollment, 2, function(x){sum(is.na(x))})  # per-column NA counts
max(na_count)       # worst-case missingness across columns
# Replace every NA in a vector with the median of the observed values.
# If every value is NA (or the vector is empty), the input is returned
# unchanged, since the median itself is then NA.
fillByMedian <- function(x) {
  med <- median(x, na.rm = TRUE)
  replace(x, is.na(x), med)
}
# Median-impute each lab column at enrollment. The original used
# apply(<data.frame>, 2, ...), which coerces the sub-frame to a matrix and
# would silently convert every column to character if any non-numeric
# column ever slipped into var_list; lapply() imputes column-by-column and
# preserves each column's type.
data_enrollment[, var_list] <- lapply(data_enrollment[, var_list], fillByMedian)
summary(data_enrollment)
# Backfill: any interval still missing a lab value inherits the patient's
# (median-imputed) enrollment value, carried in *_FIRST companion columns.
data_enrollment_new = data_enrollment[, c('PatientID', var_list)]
var_list_first = paste0(var_list, '_FIRST')
names(data_enrollment_new)[-1] = var_list_first
intersect(names(data1), names(data_enrollment_new)) # expect only "PatientID"
data1_new = merge(data1, data_enrollment_new, by = 'PatientID')
cbind(var_list, var_list_first) # eyeball the column pairing
# seq_along() (not 1:length()) stays correct even for an empty var_list.
for (i in seq_along(var_list)) {
  print(c(var_list[i], var_list_first[i]))
  rows_missing = is.na(data1_new[, var_list[i]])
  data1_new[rows_missing, var_list[i]] = data1_new[rows_missing, var_list_first[i]]
}
summary(data1_new)
# Drop the helper *_FIRST columns: keep exactly the columns of data1.
data1_new2 = data1_new[, names(data1)]
write.csv(data1_new2,
          file='./sample_data/TVC_dataframe_test.csv',
          row.names = F)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment