library(sureLDA)## Loading required package: Matrixlibrary(pROC)## Type 'citation("pROC")' for a citation.## 
## Attaching package: 'pROC'## The following objects are masked from 'package:stats':
## 
##     cov, smooth, varLet N denote the number of patients, W the number of EHR features, and K the number of target phenotypes to be predicted. Our input data consists of 1) X, an NxW matrix of EHR feature counts, 2) ICD, an NxK matrix of key ICD code counts for each target phenotype, 3) NLP, an NxK matrix of key NLP feature counts for each target phenotype, 4) HU, an N-dimensional vector of healthcare utilization measurements (i.e. total patient encounters in a patient’s chart), and 5) an NxK matrix of filter indicators for each target phenotype (we assume P(phenotype | filter=0) = 0).
First, we evaluate sureLDA with a PheNorm-generated prior (default) for prediction of 10 target phenotypes using a simulated dataset. We employ 10 ‘empty’ topics (this should generally be set in the range of 10-100).
surelda_run_phenorm <- with(
  simdata, sureLDA(X, ICD, NLP, HU, filter, nEmpty = 10))## Starting PheNorm## Starting Guided LDA## Starting final clusteringEvaluating AUCs of sureLDA scores across 10 phenotypes
surelda_scores_phenorm_aucs <- sapply(1:ncol(simdata$filter),function(k){
  pROC::auc(simdata$Y[,k],surelda_run_phenorm$scores[,k])
})Evaluating AUCs of predicted probabilities across 10 phenotypes
surelda_ensemble_phenorm_aucs <- sapply(1:ncol(simdata$filter),function(k){
  auc(simdata$Y[,k],surelda_run_phenorm$ensemble[,k])
})AUCs:
surelda_result_combined <- rbind(surelda_scores_phenorm_aucs,surelda_ensemble_phenorm_aucs)
rownames(surelda_result_combined) <- c('sureLDA Scores','sureLDA Probs')
print(surelda_result_combined)##                     [,1]      [,2]      [,3]      [,4]      [,5]      [,6]
## sureLDA Scores 0.9106220 0.8673095 0.8748029 0.9114881 0.8786685 0.8759985
## sureLDA Probs  0.9110456 0.8637459 0.8726527 0.9097024 0.8771739 0.8645873
##                     [,7]      [,8]      [,9]     [,10]
## sureLDA Scores 0.9959016 0.7775945 0.8735303 0.8731959
## sureLDA Probs  0.9873634 0.7690034 0.8666961 0.8962199Next, we evaluate sureLDA’s predictions of the same 10 target phenotypes using the same data but given the prior and phi estimators from the previous run.
surelda_prediction <- with(simdata,
                        sureLDA(X, ICD, NLP, HU, filter, prior = surelda_run_phenorm$prior, nEmpty = 10,
                              weight = surelda_run_phenorm$weight, phi = surelda_run_phenorm$phi))## Inferring theta given provided phi## Starting final clusteringEvaluating AUCs of sureLDA scores across 10 phenotypes
surelda_scores_prediction_aucs <- sapply(1:ncol(simdata$filter),function(k){
  auc(simdata$Y[,k],surelda_prediction$scores[,k])
})Evaluating AUCs of predicted probabilities across 10 phenotypes
surelda_ensemble_prediction_aucs <- sapply(1:ncol(simdata$filter),function(k){
  auc(simdata$Y[,k],surelda_prediction$ensemble[,k])
})AUCs:
surelda_prediction_result_combined <- rbind(surelda_scores_prediction_aucs,surelda_ensemble_prediction_aucs)
rownames(surelda_prediction_result_combined) <- c('sureLDA Scores','sureLDA Probs')
print(surelda_prediction_result_combined)##                     [,1]      [,2]      [,3]      [,4]      [,5]      [,6]
## sureLDA Scores 0.9112489 0.8667855 0.8717209 0.9122321 0.8727717 0.8697223
## sureLDA Probs  0.9128416 0.8655277 0.8731902 0.9132143 0.8772283 0.8689616
##                     [,7]      [,8]      [,9]     [,10]
## sureLDA Scores 0.9953893 0.7754639 0.8683128 0.9025430
## sureLDA Probs  0.9889003 0.7758763 0.8668430 0.8988316Total time spent:
proc.time()##    user  system elapsed 
##  77.285   4.192  82.224