% Journal article record (Computers in Biology and Medicine, vol. 134, 2021).
% - title:  the acronym MRI is brace-protected so sentence-casing styles keep it uppercase.
% - author: every name uses the unambiguous "Last, First" form.
% - note:   holds only the funding acknowledgement (once) and the publisher copyright,
%           because most styles print this field verbatim in the bibliography.
% - annote: preserves, unchanged, the passage the exporter had placed in the note under
%           "Funding Information"; it reads as results discussion (it cites Fig. 4 and
%           Table 2), not funding text. Standard styles do not print annote.
@article{c30c00b1882e47e89ad9afccf51906f1,
  author   = {Nai, {Ying Hwey} and Teo, {Bernice W.} and Tan, {Nadya L.} and O'Doherty, Sophie and Stephenson, {Mary C.} and Thian, {Yee Liang} and Chiong, Edmund and Reilhac, Anthonin},
  title    = {Comparison of metrics for the evaluation of medical segmentations using prostate {MRI} dataset},
  journal  = {Computers in Biology and Medicine},
  year     = {2021},
  month    = jul,
  volume   = {134},
  doi      = {10.1016/j.compbiomed.2021.104497},
  issn     = {0010-4825},
  language = {English},
  keywords = {Deep learning, Evaluation metrics, Medical image segmentation, Prostate cancer, Rank evaluation},
  abstract = {Nine previously proposed segmentation evaluation metrics, targeting medical relevance, accounting for holes, and added regions or differentiating over- and under-segmentation, were compared with 24 traditional metrics to identify those which better capture the requirements for clinical segmentation evaluation. Evaluation was first performed using 2D synthetic shapes to highlight features and pitfalls of the metrics with known ground truths (GTs) and machine segmentations (MSs). Clinical evaluation was then performed using publicly-available prostate images of 20 subjects with MSs generated by 3 different deep learning networks (DenseVNet, HighRes3DNet, and ScaleNet) and GTs drawn by 2 readers. The same readers also performed the 2D visual assessment of the MSs using a dual negative-positive grading of −5 to 5 to reflect over- and under-estimation. Nine metrics that correlated well with visual assessment were selected for further evaluation using 3 different network ranking methods - based on a single metric, normalizing the metric using 2 GTs, and ranking the network based on a metric then averaging, including leave-one-out evaluation. These metrics yielded consistent ranking with HighRes3DNet ranked first then DenseVNet and ScaleNet using all ranking methods. Relative volume difference yielded the best positivity-agreement and correlation with dual visual assessment, and thus is better for providing over- and under-estimation. Interclass Correlation yielded the strongest correlation with the absolute visual assessment (0–5). Symmetric-boundary dice consistently yielded good discrimination of the networks for all three ranking methods with relatively small variations within network. Good rank discrimination may be an additional metric feature required for better network performance evaluation.},
  note     = {Funding Information: This study was supported by the National University Health System (NUHS) Center Grant Seed Funding, Singapore [NUHSCGSF/2019/07]. Publisher Copyright: {\textcopyright} 2021 The Author(s)},
  annote   = {All information-theoretic (MI and VOI), pair-counting (PRI and ARI), and error of spatial-overlap-based (GCE), and some probabilistic-based metrics, (ICC, KAP, AUC, and MARK) metrics were affected by matrix size (Table 2). Therefore, these metrics are not suitable for comparison across images of different matrix sizes. Moreover, information-theoretic-based metrics yielded poor correlation with VA (Fig. 4) and hence are not suitable for clinical segmentation evaluation. PRI also yielded a poor correlation with VA (Fig. 4), with no change in values in the evaluation of the synthetic shape (Table 2), thus it is not a good metric. Conversely, ARI showed a strong correlation with VA across the three networks and for both readers (Fig. 4). ICC and KAP were consistently ranked as the top 2 metrics with the strongest correlation with VA (Fig. 4) and good synthetic shape outcomes (Table 2). Volume-based metrics (VS and RVD) are not affected by matrix size, but could not differentiate other features apart from volume (Table 2). RVD thus may only be suitable in providing over- and under-estimation as supported by better plus-minus-sign agreement with visual assessment, compared to CF, FPR, and FNR, and had the best correlation with VA (Fig. 4). All spatial-distance-based metrics were not affected by matrix size (Table 2). HD and BLD only yielded the expected values for (a) to (c), and ASD, RMSD, and BLD yield values with ranking more similar to visual expectations for the fixed shapes evaluation with (a) and (b) performing better than (c) and (d). Particularly, ASD yielded a high correlation with VA with |ρ| ≥ 0.7 (Fig. 4). For spatial-overlap-based metrics, only DSC and JAC yielded values similar to expectations for the fixed shapes evaluation (Table 2) and yielded a high correlation (ρ ≥ 0.7) with VA (Fig. 4). HI and nFDR did not perform better than the other commonly-applied metrics for segmentation evaluation and yielded only average correlation with VA (|ρ| ≥ 0.6).},
}