diff --git a/generate_annotations_with_malignancy.ipynb b/generate_annotations_with_malignancy.ipynb new file mode 100644 index 0000000..498636d --- /dev/null +++ b/generate_annotations_with_malignancy.ipynb @@ -0,0 +1,433 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Malignancy Annotations\n", + "\n", + "This notebook compiles the `annotations_with_malignancy.csv` and also drops annotations for CTs it cannot find.\n", + "\n", + "In addition to the usual suspects, you need to have the `pylidc` Python package (use `pip install pylidc` or [check out the source](https://pylidc.github.io/))." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import SimpleITK as sitk\n", + "import pandas\n", + "import glob, os\n", + "import numpy\n", + "import tqdm\n", + "import pylidc\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We first load the annotations from the LUNA challenge." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "annotations = pandas.read_csv('data/part2/luna/annotations.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "scrolled": false + }, + "source": [ + "For the CTs where we have a `.mhd` file, we collect the malignancy_data from PyLIDC.\n", + "\n", + "It is a bit tedious as we need to convert the pixel locations provided by PyLIDC to physical points.\n", + "We will see some warnings about annotations being too close to each other (PyLIDC expects to have 4 annotations per site, see Chapter 14 for some details, including when we consider a nodule to be malignant).\n", + "\n", + "This takes quite a while (~1-2 seconds per scan on one of the authors' computers)."
# Collect malignancy ratings from PyLIDC for every CT series that has a
# `.mhd` file on disk.
#
# Names defined by this cell:
#   malignancy_data -- one tuple per PyLIDC annotation cluster:
#                      (seriesuid, centre x/y/z, bbox-low x/y/z,
#                       bbox-high x/y/z, is_malignant, individual ratings)
#   missing         -- series UIDs that were skipped, either because no
#                      `.mhd` file was found or because PyLIDC has no scan
#                      for that series
#   spacing_dict    -- series UID -> value of GetSpacing() for its image
#   scans, suids    -- PyLIDC scans keyed by series UID / UIDs to process
#
# While running, PyLIDC may print "Failed to reduce all groups to <= 4
# Annotations." for some scans; the loop simply carries on.


def _index_to_physical(image, index_triple):
    """Map an index triple (numpy array of length 3) to a physical point.

    The first two components are swapped and every component is rounded to
    the nearest integer before calling ``TransformIndexToPhysicalPoint``.
    NOTE(review): the swap presumably converts PyLIDC's (row, col, slice)
    order to SimpleITK's (x, y, z) order -- confirm against both libraries.
    """
    reordered = index_triple[[1, 0, 2]]
    return image.TransformIndexToPhysicalPoint(
        [int(numpy.round(component)) for component in reordered]
    )


malignancy_data = []
missing = []
spacing_dict = {}
scans = {s.series_instance_uid: s for s in pylidc.query(pylidc.Scan).all()}
suids = annotations.seriesuid.unique()

for suid in tqdm.tqdm(suids):
    mhd_paths = glob.glob('./data-unversioned/part2/luna/subset*/{}.mhd'.format(suid))
    # No file on disk (or a hit that still contains a literal '*'): skip.
    if len(mhd_paths) == 0 or '*' in mhd_paths[0]:
        missing.append(suid)
        continue

    # A series with an .mhd file but no PyLIDC scan used to raise KeyError
    # and abort the whole (slow) loop; record it as missing instead.
    scan = scans.get(suid)
    if scan is None:
        missing.append(suid)
        continue

    ct_image = sitk.ReadImage(mhd_paths[0])
    spacing_dict[suid] = ct_image.GetSpacing()

    for ann_cluster in scan.cluster_annotations():
        ratings = [a.malignancy for a in ann_cluster]
        # this is our malignancy criterion described in Chapter 14:
        # at least two annotations rate the nodule 4 or higher
        is_malignant = sum(rating >= 4 for rating in ratings) >= 2

        # Average the cluster's annotations in index space, then convert.
        centroid = numpy.mean([a.centroid for a in ann_cluster], 0)
        # After the transpose, row 0 is used as the low corner and row 1 as
        # the high corner of the averaged bounding box.
        bbox = numpy.mean([a.bbox_matrix() for a in ann_cluster], 0).T

        coord = _index_to_physical(ct_image, centroid)
        bbox_low = _index_to_physical(ct_image, bbox[0])
        bbox_high = _index_to_physical(ct_image, bbox[1])

        malignancy_data.append((
            suid,
            coord[0], coord[1], coord[2],
            bbox_low[0], bbox_low[1], bbox_low[2],
            bbox_high[0], bbox_high[1], bbox_high[2],
            is_malignant,
            ratings,
        ))

# You can check how many `mhd`s you are missing. It seems that the LUNA data
# has dropped a couple(?). Don't worry if there are <10 missing.
# Report the series that were skipped while collecting malignancy data.
print("MISSING", missing)

# We stick the data we got from PyLIDC into a DataFrame.
df_mal = pandas.DataFrame(
    malignancy_data,
    columns=[
        'seriesuid',
        'coordX', 'coordY', 'coordZ',
        'bboxLowX', 'bboxLowY', 'bboxLowZ',
        'bboxHighX', 'bboxHighY', 'bboxHighZ',
        'mal_bool', 'mal_details',
    ],
)

# And now we match the malignancy data to the annotations. This is a lot
# faster than the collection step.
#
# Every annotation starts out unmatched: NaN malignancy flag, NaN bounding
# box and an empty list of individual ratings. Matched rows get these
# columns overwritten from the PyLIDC cluster they were paired with.
bbox_keys = ['bboxLowX', 'bboxLowY', 'bboxLowZ', 'bboxHighX', 'bboxHighY', 'bboxHighZ']
annotations['mal_bool'] = float('nan')
annotations['mal_details'] = [[] for _ in range(len(annotations))]
for k in bbox_keys:
    annotations[k] = float('nan')

processed_parts = []
for series_id in tqdm.tqdm(annotations.seriesuid.unique()):
    a = annotations[annotations.seriesuid == series_id]
    m = df_mal[df_mal.seriesuid == series_id]

    if len(m) == 0:
        # No PyLIDC clusters for this series: keep its annotations unmatched.
        # (The original appended an undefined name `c` here, which raised
        # NameError for any such series.)
        processed_parts.append(a)
        continue

    m_ctrs = m[['coordX', 'coordY', 'coordZ']].values
    a_ctrs = a[['coordX', 'coordY', 'coordZ']].values

    # matches[i, j] is True when annotation i and cluster j have centres
    # closer than half of annotation i's diameter.
    centre_dist = numpy.linalg.norm(a_ctrs[:, None] - m_ctrs[None], ord=2, axis=-1)
    matches = (centre_dist / a.diameter_mm.values[:, None] < 0.5)
    has_match = matches.max(-1)
    # argmax on a boolean row yields the first matching cluster.
    match_idx = matches.argmax(-1)[has_match]

    a_matched = a[has_match].copy()
    a_matched['mal_bool'] = m.mal_bool.values[match_idx]
    a_matched['mal_details'] = m.mal_details.values[match_idx]
    for k in bbox_keys:
        a_matched[k] = m[k].values[match_idx]

    processed_parts.append(a_matched)
    processed_parts.append(a[~has_match])

processed_annot = pandas.concat(processed_parts).sort_values('mal_bool', ascending=False)
processed_annot['len_mal_details'] = processed_annot.mal_details.apply(len)

# Finally, we drop NAs (where we didn't find a match) and save it in the
# right place.
df_nona = processed_annot.dropna()
df_nona.to_csv('./data/part2/luna/annotations_with_malignancy.csv', index=False)