"""
---------------------------------------------------------------------------
Created on Fri April 10 10:32:32 2023
----------------------------------------------------------------------------
**Title:** ValidPath Toolbox - Uncertainty Analysis module
**Description:** This is the Uncertainty Analysis module of the ValidPath toolbox. It includes the Uncertainty_Analysis class and several methods
**Classes:** Uncertainty_Analysis
**Methods:** get_report, auc_keras_, ci_, Delong_CI, compute_midrank, compute_midrank_weight, calc_pvalue, compute_ground_truth_statistics, delong_roc_variance, bootstrapping
---------------------------------------------------------------------------
Author: SeyedM.MousaviKahaki (seyed.kahaki@fda.hhs.gov)
Version ='1.0'
---------------------------------------------------------------------------
"""
import numpy as np
from sklearn.metrics import roc_auc_score
import scipy.stats
from scipy import stats
import os, os.path
import pandas as pd
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, cohen_kappa_score
from scipy.stats import norm
from sklearn.metrics import auc
import matplotlib.pyplot as plt
[docs]
class Uncertainty_Analysis:
    """Uncertainty-analysis utilities for binary-classifier outputs.

    Behavior is configured through instance attributes:
      perform_Delong    -- when True, get_report also computes the DeLong AUC CI
      perform_Bootstrap -- when True, get_report also bootstraps the AUC
      plot_roc          -- when True, get_report draws the ROC curve
      tag               -- label printed in the report header
      n_bootstraps      -- number of bootstrap resamples
    """

    def __init__(self):
        # Optional analyses are off by default; callers flip these flags
        # before invoking get_report.
        self.perform_Delong = False
        self.perform_Bootstrap = False
        self.plot_roc = False
        self.tag = 'My Results'
        self.n_bootstraps = 1000
[docs]
def get_report(self, y_pred, y_truth):
    """Compute and print a performance report for binary predictions.

    Main entry point of Uncertainty_Analysis: builds a labeled confusion
    matrix, precision/recall with Wald confidence intervals and the ROC
    curve; depending on the instance flags ``perform_Delong``,
    ``perform_Bootstrap`` and ``plot_roc`` it additionally computes DeLong
    AUC statistics, a bootstrap CI and an ROC plot.

    :Parameters:
        y_truth: ground_truth - np.array of 0 and 1
        y_pred: predictions - np.array of floats of the probability of being class 1
    :Returns:
        (precision, precision CI, recall, recall CI, DeLong AUC, DeLong AUC
        covariance, DeLong AUC CI, false-positive rates, true-positive rates,
        AUC, confusion-matrix DataFrame)
    """
    # .round() thresholds the probabilities at 0.5 to obtain hard labels.
    # labels=[1, 0] puts the positive class in the first row/column.
    cmtx = pd.DataFrame(
        confusion_matrix(y_truth.round(), y_pred.round(), labels=[1, 0]),
        index=['true:yes', 'true:no'],
        columns=['pred:yes', 'pred:no']
    )
    ############### Precision
    precision = precision_score(y_truth.round(), y_pred.round())
    ############### Recall
    recall = recall_score(y_truth.round(), y_pred.round())
    # ############### F1 score
    # Computed but intentionally not printed or returned.
    f1_score1 = f1_score(y_truth.round(), y_pred.round())
    # ############### Cohen's kappa
    # Computed but intentionally not printed or returned.
    cohen_kappa_score1 = cohen_kappa_score(y_truth.round(), y_pred.round())
    # Second confusion matrix uses sklearn's default label order:
    # row/col 0 = class 0, row/col 1 = class 1.
    confusion_m = confusion_matrix(y_truth.round(), y_pred.round())
    TP = confusion_m[1][1]
    FP = confusion_m[0][1]
    TN = confusion_m[0][0]
    FN = confusion_m[1][0]
    # Wald CI for precision: TP successes out of predicted positives.
    n = TP+FP
    Precision_CI = self.ci_(TP, n, alpha=0.05)
    # Wald CI for recall: TP successes out of actual positives.
    n = TP+FN
    Recall_CI = self.ci_(TP, n, alpha=0.05)
    # thresholds_keras is unused; roc_curve consumes the raw probabilities.
    fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_truth, y_pred)
    # NOTE(review): confirm auc_keras_ actually returns the computed AUC
    # before relying on auc_keras downstream.
    auc_keras = self.auc_keras_(fpr_keras, tpr_keras)
    if self.perform_Delong:
        auc_delong, ci_delong, lower_upper_q, auc_delong_cov, auc_std = self.Delong_CI(y_pred, y_truth)
    else:
        # Empty-list placeholders keep the returned tuple's shape stable
        # when the DeLong analysis is skipped.
        auc_delong = ci_delong = lower_upper_q = auc_delong_cov = auc_std = []
    ########## Print All Together
    print('####################')
    print("Results for "+self.tag)
    print(cmtx)
    print("Precision: ", precision)
    print("Precision_CI: ", Precision_CI)
    print("Recall: ", recall)
    print("Recall_CI: ", Recall_CI)
    if self.perform_Delong:
        print("Delong Method")
        print('AUC:', auc_delong)
        print('AUC COV:', auc_delong_cov)
        print('95% AUC CI:', ci_delong)
    # ### Bootstrap
    if self.perform_Bootstrap:
        bootstrapped_scores, confidence_lower, confidence_upper = self.bootstrapping(y_truth, y_pred)
    else:
        # Empty-dict placeholders when bootstrapping is skipped; note these
        # are not part of the returned tuple either way.
        bootstrapped_scores = confidence_lower = confidence_upper = {}
    if self.plot_roc == True:
        # ### AUC Plot
        plt.figure(1)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.plot(fpr_keras, tpr_keras, label='keras (AUC = {:.3f})'.format(auc_keras))
        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
        plt.title('ROC curve')
        plt.legend(loc='best')
        plt.show()
    return (precision, Precision_CI, recall, Recall_CI, auc_delong, auc_delong_cov, ci_delong, fpr_keras, tpr_keras, auc_keras, cmtx)
[docs]
def auc_keras_(self, fpr_keras, tpr_keras):
""" Estimates confidence interval for Bernoulli p
:Parameters:
fpr_keras: False Positive Rate Values
tpr_keras: True Positive Rate Values
:Returns:
AUC: Area Under the ROC Curve
"""
auc_keras = auc(fpr_keras, tpr_keras)
#return auc_keras
[docs]
def ci_(self, tp, n, alpha=0.05):
""" Estimates confidence interval for Bernoulli p
:Parameters:
tp: number of positive outcomes, TP in this case
n: number of attemps, TP+FP for Precision, TP+FN for Recall
alpha: confidence level
:Returns:
Tuple[float, float]: lower and upper bounds of the confidence interval
"""
p_hat = float(tp) / n
z_score = norm.isf(alpha * 0.5) # two sides, so alpha/2 on each side
variance_of_sum = p_hat * (1-p_hat) / n
std = variance_of_sum ** 0.5
return p_hat - z_score * std, p_hat + z_score * std
[docs]
def Delong_CI(self, y_pred ,y_truth):
""" A Python implementation of an algorithm for computing the statistical significance of comparing two sets of predictions by ROC AUC.
Also can compute variance of a single ROC AUC estimate. X. Sun and W. Xu, "Fast Implementation of DeLong’s Algorithm for Comparing the
Areas Under Correlated Receiver Operating Characteristic Curves," in IEEE Signal Processing Letters, vol. 21, no. 11, pp. 1389-1393, Nov. 2014,
doi: 10.1109/LSP.2014.2337313.
:Parameters:
y_truth: ground_truth - np.array of 0 and 1
y_pred: predictions - np.array of floats of the probability of being class 1
:Returns:
auc, ci, lower_upper_q, auc_cov, auc_std
"""
alpha = .95
y_true = np.array(y_truth)
auc, auc_cov = self.delong_roc_variance(
y_true,
y_pred)
auc_std = np.sqrt(auc_cov)
lower_upper_q = np.abs(np.array([0, 1]) - (1 - alpha) / 2)
ci = stats.norm.ppf(
lower_upper_q,
loc=auc,
scale=auc_std)
ci[ci > 1] = 1
return auc, ci, lower_upper_q, auc_cov, auc_std
# stackoverflow.com/questions/19124239/scikit-learn-roc-curve-with-confidence-intervals
[docs]
def compute_midrank(self,x):
"""Computes midranks.
:Parameters:
x - a 1D numpy array
:Returns:
array of midranks
"""
J = np.argsort(x)
Z = x[J]
N = len(x)
T = np.zeros(N, dtype=float)
i = 0
while i < N:
j = i
while j < N and Z[j] == Z[i]:
j += 1
T[i:j] = 0.5*(i + j - 1)
i = j
T2 = np.empty(N, dtype=float)
# Note(kazeevn) +1 is due to Python using 0-based indexing
# instead of 1-based in the AUC formula in the paper
T2[J] = T + 1
return T2
[docs]
def compute_midrank_weight(self, x, sample_weight):
"""Computes midranks.
:Parameters:
x - a 1D numpy array
:Returns:
array of midranks
"""
J = np.argsort(x)
Z = x[J]
cumulative_weight = np.cumsum(sample_weight[J])
N = len(x)
T = np.zeros(N, dtype=float)
i = 0
while i < N:
j = i
while j < N and Z[j] == Z[i]:
j += 1
T[i:j] = cumulative_weight[i:j].mean()
i = j
T2 = np.empty(N, dtype=float)
T2[J] = T
return T2
[docs]
def fastDeLong(self, predictions_sorted_transposed, label_1_count, sample_weight):
if sample_weight is None:
return self.fastDeLong_no_weights(predictions_sorted_transposed, label_1_count)
else:
return self.fastDeLong_weights(predictions_sorted_transposed, label_1_count, sample_weight)
[docs]
def fastDeLong_weights(self, predictions_sorted_transposed, label_1_count, sample_weight):
    """
    The fast version of DeLong's method for computing the covariance of
    unadjusted AUC, generalized to per-example sample weights.

    :Parameters:
        predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
            sorted such as the examples with label "1" are first
        label_1_count: number of positive examples (the leading columns)
        sample_weight: 1D np.array of per-example weights, aligned with the
            columns of predictions_sorted_transposed
    :Returns:
        (AUC value, DeLong covariance)
    :Reference:
     @article{sun2014fast,
       title={Fast Implementation of DeLong's Algorithm for Comparing the Areas Under Correlated Receiver Operating Characteristic Curves},
       author={Xu Sun and Weichao Xu},
       journal={IEEE Signal Processing Letters},
       volume={21},
       number={11},
       pages={1389--1393},
       year={2014},
       publisher={IEEE}
     }
    """
    # Short variables are named as they are in the paper:
    # m = number of positives, n = number of negatives, k = classifiers.
    m = label_1_count
    n = predictions_sorted_transposed.shape[1] - m
    positive_examples = predictions_sorted_transposed[:, :m]
    negative_examples = predictions_sorted_transposed[:, m:]
    k = predictions_sorted_transposed.shape[0]
    # Weighted midranks within the positives (tx), within the negatives (ty),
    # and over the pooled sample (tz), computed per classifier row.
    tx = np.empty([k, m], dtype=float)
    ty = np.empty([k, n], dtype=float)
    tz = np.empty([k, m + n], dtype=float)
    for r in range(k):
        tx[r, :] = self.compute_midrank_weight(positive_examples[r, :], sample_weight[:m])
        ty[r, :] = self.compute_midrank_weight(negative_examples[r, :], sample_weight[m:])
        tz[r, :] = self.compute_midrank_weight(predictions_sorted_transposed[r, :], sample_weight)
    total_positive_weights = sample_weight[:m].sum()
    total_negative_weights = sample_weight[m:].sum()
    # Outer product of positive and negative weights: one entry per
    # positive/negative pair; its sum normalizes the weighted AUC.
    pair_weights = np.dot(sample_weight[:m, np.newaxis], sample_weight[np.newaxis, m:])
    total_pair_weights = pair_weights.sum()
    aucs = (sample_weight[:m]*(tz[:, :m] - tx)).sum(axis=1) / total_pair_weights
    # Structural components V01/V10 of the DeLong covariance estimator.
    v01 = (tz[:, :m] - tx[:, :]) / total_negative_weights
    v10 = 1. - (tz[:, m:] - ty[:, :]) / total_positive_weights
    sx = np.cov(v01)
    sy = np.cov(v10)
    delongcov = sx / m + sy / n
    return aucs, delongcov
[docs]
def fastDeLong_no_weights(self, predictions_sorted_transposed, label_1_count):
"""
The fast version of DeLong's method for computing the covariance of
unadjusted AUC.
:Parameters:
predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
sorted such as the examples with label "1" are first
:Returns:
(AUC value, DeLong covariance)
Reference:
@article{sun2014fast,
title={Fast Implementation of DeLong's Algorithm for Comparing the Areas Under Correlated Receiver Oerating Characteristic Curves},
author={Xu Sun and Weichao Xu},
journal={IEEE Signal Processing Letters},
volume={21},
number={11},
pages={1389--1393},
year={2014},
publisher={IEEE}
}
"""
# Short variables are named as they are in the paper
m = label_1_count
n = predictions_sorted_transposed.shape[1] - m
positive_examples = predictions_sorted_transposed[:, :m]
negative_examples = predictions_sorted_transposed[:, m:]
k = predictions_sorted_transposed.shape[0]
tx = np.empty([k, m], dtype=float)
ty = np.empty([k, n], dtype=float)
tz = np.empty([k, m + n], dtype=float)
for r in range(k):
tx[r, :] = self.compute_midrank(positive_examples[r, :])
ty[r, :] = self.compute_midrank(negative_examples[r, :])
tz[r, :] = self.compute_midrank(predictions_sorted_transposed[r, :])
aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
v01 = (tz[:, :m] - tx[:, :]) / n
v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
sx = np.cov(v01)
sy = np.cov(v10)
delongcov = sx / m + sy / n
return aucs, delongcov
[docs]
def calc_pvalue(self, aucs, sigma):
"""Computes log(10) of p-values.
:Parameters:
aucs: 1D array of AUCs
sigma: AUC DeLong covariances
:Returns:
log10(pvalue)
"""
l = np.array([[1, -1]])
z = np.abs(np.diff(aucs)) / np.sqrt(np.dot(np.dot(l, sigma), l.T))
return np.log10(2) + scipy.stats.norm.logsf(z, loc=0, scale=1) / np.log(10)
[docs]
def compute_ground_truth_statistics(self, ground_truth, sample_weight):
assert np.array_equal(np.unique(ground_truth), [0, 1])
order = (-ground_truth).argsort()
label_1_count = int(ground_truth.sum())
if sample_weight is None:
ordered_sample_weight = None
else:
ordered_sample_weight = sample_weight[order]
return order, label_1_count, ordered_sample_weight
[docs]
def delong_roc_variance(self, ground_truth, predictions, sample_weight=None):
"""
Computes ROC AUC variance for a single set of predictions
:Parameters:
ground_truth: np.array of 0 and 1
predictions: np.array of floats of the probability of being class 1
"""
order, label_1_count, ordered_sample_weight = self.compute_ground_truth_statistics(
ground_truth, sample_weight)
predictions_sorted_transposed = predictions[np.newaxis, order]
aucs, delongcov = self.fastDeLong(predictions_sorted_transposed, label_1_count, ordered_sample_weight)
assert len(aucs) == 1
return aucs[0], delongcov
[docs]
def bootstrapping(self, y_true, y_pred):
    """
    Bootstrap a confidence interval for the ROC AUC.

    Resamples prediction indices with replacement ``self.n_bootstraps``
    times, recomputes the ROC AUC on each resample, plots a histogram of
    the scores, and prints and returns the empirical confidence bounds.

    :Parameters:
        y_true: ground truth - np.array of 0 and 1
        y_pred: predictions - np.array of floats of the probability of being class 1
          (assumes both support numpy fancy indexing - TODO confirm callers
          never pass plain lists)
    :Returns:
        (bootstrapped_scores, confidence_lower, confidence_upper)
    """
    print("Original ROC area: {:0.3f}".format(roc_auc_score(y_true, y_pred)))
    rng_seed = 42  # fixed seed: control reproducibility of the resampling
    bootstrapped_scores = []
    rng = np.random.RandomState(rng_seed)
    for i in range(self.n_bootstraps):
        # bootstrap by sampling with replacement on the prediction indices
        indices = rng.randint(0, len(y_pred), len(y_pred))
        if len(np.unique(y_true[indices])) < 2:
            # We need at least one positive and one negative sample for ROC AUC
            # to be defined: reject the sample
            continue
        score = roc_auc_score(y_true[indices], y_pred[indices])
        bootstrapped_scores.append(score)
        print("Bootstrap #{} ROC area: {:0.3f}".format(i + 1, score))
    # Histogram of the resampled AUCs (always shown, regardless of plot_roc).
    plt.hist(bootstrapped_scores, bins=50)
    plt.title('Histogram of the bootstrapped ROC AUC scores')
    plt.show()
    sorted_scores = np.array(bootstrapped_scores)
    sorted_scores.sort()
    # Computing the lower and upper bound of the 90% confidence interval
    # You can change the bounds percentiles to 0.025 and 0.975 to get
    # a 95% confidence interval instead.
    # NOTE(review): indexing fails if every resample was rejected
    # (empty sorted_scores) - verify inputs contain both classes.
    confidence_lower = sorted_scores[int(0.05 * len(sorted_scores))]
    confidence_upper = sorted_scores[int(0.95 * len(sorted_scores))]
    print("Bootstrap")
    print("Confidence interval for the score: [{:0.3f} - {:0.3}]".format(
        confidence_lower, confidence_upper))
    return bootstrapped_scores, confidence_lower, confidence_upper