diff --git a/examples/extract_key_values_from_cbc_reports_in_different_layout.ipynb b/examples/extract_key_values_from_cbc_reports_in_different_layout.ipynb index e573076..9529498 100644 --- a/examples/extract_key_values_from_cbc_reports_in_different_layout.ipynb +++ b/examples/extract_key_values_from_cbc_reports_in_different_layout.ipynb @@ -194,95 +194,99 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[[[{'NAME': 'Test Patient A',\n", - " 'HAEMOGLOBIN': '12.6',\n", - " 'RBC (RED CELLS COUNT)': '4.54',\n", - " 'HAEMATOCRIT(PCV)': '38.1',\n", - " 'MCV': '83.9',\n", - " 'MCH': '27.8',\n", - " 'MCHC': '33.1',\n", - " 'TOTAL LEUCOCYTE COUNT': '4.62',\n", - " 'DIFFERENTIAL LEUCOCYTE COUNT': 'NA',\n", - " 'NEUTROPHILS': '66',\n", - " 'LYMPHOCYTES': '28',\n", - " 'MONOCYTES': '06',\n", - " 'EOSINOPHILS': '00',\n", - " 'BASOPHIL': '00',\n", - " 'PROMYELOCYTES': 'NA',\n", - " 'MYELOCYTES': 'NA',\n", - " 'METAMYELOCYTES': 'NA',\n", - " 'BLASTS': 'NA',\n", - " 'PLATELETS': '195',\n", - " 'RETICULOCYTE COUNT': 'NA'}]],\n", - " [[{'NAME': 'TEST PATIENT 12',\n", - " 'HAEMOGLOBIN': '12.8',\n", - " 'RBC (RED CELLS COUNT)': '4.5',\n", - " 'HAEMATOCRIT(PCV)': '42',\n", - " 'MCV': '93',\n", - " 'MCH': '28',\n", - " 'MCHC': '31',\n", - " 'TOTAL LEUCOCYTE COUNT': '12.6',\n", - " 'DIFFERENTIAL LEUCOCYTE COUNT': 'NA',\n", - " 'NEUTROPHILS': '67',\n", - " 'LYMPHOCYTES': '23',\n", - " 'MONOCYTES': '07',\n", - " 'EOSINOPHILS': '03',\n", - " 'BASOPHIL': '00',\n", - " 'PROMYELOCYTES': 'NA',\n", - " 'MYELOCYTES': 'NA',\n", - " 'METAMYELOCYTES': 'NA',\n", - " 'BLASTS': 'NA',\n", - " 'PLATELETS': '210',\n", - " 'RETICULOCYTE COUNT': 'NA'}]],\n", - " [[{'NAME': 'Test Patient 14',\n", - " 'HAEMOGLOBIN': '14.8',\n", - " 'RBC (RED CELLS COUNT)': '5.22',\n", - " 'HAEMATOCRIT(PCV)': '45.2',\n", - " 'MCV': '86.6',\n", - " 'MCH': '28.4',\n", - " 'MCHC': '32.7',\n", - " 'TOTAL LEUCOCYTE COUNT': '5.45',\n", - " 'DIFFERENTIAL LEUCOCYTE COUNT': 'NA',\n", - " 'NEUTROPHILS': '64.3',\n", - " 'LYMPHOCYTES': '25.1',\n", - " 'MONOCYTES': '7.2',\n", - " 'EOSINOPHILS': '2.8',\n", - " 'BASOPHIL': '0.6',\n", - " 'PROMYELOCYTES': 'NA',\n", - " 'MYELOCYTES': 'NA',\n", - " 'METAMYELOCYTES': 'NA',\n", - " 'BLASTS': 'NA',\n", - " 'PLATELETS': '287',\n", - " 'RETICULOCYTE COUNT': 'NA'}]],\n", - " [[{'NAME': 'Test Patient 20',\n", - " 'HAEMOGLOBIN': '11.3',\n", - " 'RBC (RED CELLS COUNT)': '4.2',\n", - " 'HAEMATOCRIT(PCV)': '35',\n", + "[[[{'SCL No': '1232475',\n", + " 'Name': 'Test Patient A',\n", + " 'Contact #': '03443730681',\n", + " 'Ward Name': 'NA',\n", + " 'Admission#': 'NA',\n", + " 'Collection Time': '15-Jan-202410:19',\n", + " 'REF BY': 'Test Doctor A',\n", + " 'Haemoglobin': '12.6 gm/dl',\n", + " 'Haematocrit': '38.1 %',\n", + " 'R.B.C': '4.54 10E12/L',\n", + " 'M.C.V': '83.9 FL',\n", + " 'M.C.H': '27,8 PG',\n", + " 'M.C.H.C': '33.1 %',\n", + " 'W.B.C': '4.62 x10E9/L',\n", + " 'Neutrophils': '66 %',\n", + " 'Lymphocytes': '28 %',\n", + " 'Eosinophils': '00 %',\n", + " 'Monocytes': '06 %',\n", + " 'Basophils': '00 %',\n", + " 'Platelet Count': '195 x10E9L',\n", + " 'Normochromic Normocytic': 'NA'}]],\n", + " [[{'MRNo': '231121-116431337',\n", + " 'InvoiceNo': '1115230939',\n", + " 'Name': 'TEST PATIENT 12',\n", + " 'Age/Gender': 'NA',\n", + " 'ReferredBy': 'SELF',\n", + " 'SampleDate': '22-Nov-2023',\n", + " 'HB': '12.8 gm/dL',\n", + " 'RBC': '4.5 x 10^12/L',\n", + " 'HCT': '42 %',\n", + " 'MCV': '93 fL',\n", + " 'MCH': '28 pg',\n", + " 'MCHC': '31 gm/dL',\n", + " 'WBC': '12.6 x 10e9/L',\n", + " 'PLATELETS': '210 x10^9/L',\n", + " 'NEUTROPHILS%': '67 %',\n", + " 'LYMPHOCYTES%': '23%',\n", + " 'MONOCYTES%': '07 %',\n", + " 'EOSINOPHILS%': '03 %',\n", + " 'BASOPHILS%': '00 %',\n", + " 'Remarks': 'Normocytic, normochromic. Leucocytosis. ? Cause. Clinical Correlation Advised'}]],\n", + " [[{'Lab No': '23020107031',\n", + " 'Name': 'Test Patient 14',\n", + " 'Age': 'NA',\n", + " 'Gender': 'NA',\n", + " 'Contact #': 'NA',\n", + " 'Location': 'NA',\n", + " 'Ward/ Bed': 'F-OPD',\n", + " 'Collection Time': '04-Mar-2023 11:13',\n", + " 'Reporting Time': '04-Mar-2023 12:14',\n", + " 'Haemoglobin': '14.8',\n", + " 'Haematocrit': '45.2',\n", + " 'M.C.V': '86.6',\n", + " 'M.C.H': '28.4',\n", + " 'M.C.H.C': '32.7',\n", + " 'RBC': '5.22',\n", + " 'Platelet Count': '287',\n", + " 'WBC': '5.45',\n", + " 'Neutrophils': '64.3',\n", + " 'Lymphocytes': '25.1',\n", + " 'Eosinophils': '2.8',\n", + " 'Monocytes': '7.2',\n", + " 'Basophils': '0.6',\n", + " 'ESR': '10'}]],\n", + " [[{'Name': 'Test Patient 20',\n", + " 'Age': 'NA',\n", + " 'Gender': 'NA',\n", + " 'TestDate': '10-June-2023',\n", + " 'TestDescription': 'Haematology',\n", + " 'Haemoglobin': '11.3',\n", + " 'RedBloodCellCount': '4.2',\n", + " 'Haematocrit': '35',\n", " 'MCV': '82',\n", " 'MCH': '27',\n", " 'MCHC': '33',\n", - " 'TOTAL LEUCOCYTE COUNT': '9,600',\n", - " 'DIFFERENTIAL LEUCOCYTE COUNT': 'NA',\n", - " 'NEUTROPHILS': '58',\n", - " 'LYMPHOCYTES': '40',\n", - " 'MONOCYTES': '01',\n", - " 'EOSINOPHILS': '01',\n", - " 'BASOPHIL': 'NA',\n", - " 'PROMYELOCYTES': 'NA',\n", - " 'MYELOCYTES': 'NA',\n", - " 'METAMYELOCYTES': 'NA',\n", - " 'BLASTS': 'NA',\n", - " 'PLATELETS': '238,000',\n", - " 'RETICULOCYTE COUNT': 'NA'}]]]" + " 'TotalLeukocyteCount': '9,600',\n", + " 'NeutrophilsPercentage': '58',\n", + " 'LymphocytesPercentage': '40',\n", + " 'MonocytesPercentage': '01',\n", + " 'EosinophilsPercentage': '01',\n", + " 'BasophilPercentage': 'NA',\n", + " 'Platelets': '238,000',\n", + " 'ReticulocyteCount': 'NA',\n", + " 'Normoblast': 'Normochromic. Anisocytosis.'}]]]" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -291,290 +295,6 @@ "qa_results" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Output Analysis\n", - "Now, we will analyze the output and compare it with the expected result. We'll take note of any missing keys, any additional keys added by OpenParser, and any incorrect values." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "---------------------------------------------------\n", - "REPORT 1\n", - "\n", - "Summary: {'missing_keys': [], 'additional_keys': [], 'incorrect_keys': []}\n", - "---------------------------------------------------\n", - "REPORT 2\n", - "\n", - "Summary: {'missing_keys': [], 'additional_keys': [], 'incorrect_keys': []}\n", - "---------------------------------------------------\n", - "REPORT 3\n", - "\n", - "Summary: {'missing_keys': [], 'additional_keys': [], 'incorrect_keys': []}\n", - "---------------------------------------------------\n", - "REPORT 4\n", - "\n", - "Summary: {'missing_keys': [], 'additional_keys': [], 'incorrect_keys': []}\n" - ] - } - ], - "source": [ - "values_list = []\n", - "\n", - "def compare_vals(str1, str2):\n", - " try:\n", - " num1 = float(str1)\n", - " num2 = float(str2)\n", - " return num1 == num2\n", - " except ValueError:\n", - " return str1 == str2\n", - "\n", - "for idx, qa_result in enumerate(qa_results):\n", - " print('---------------------------------------------------')\n", - " print(f\"REPORT {idx+1}\\n\")\n", - " data = qa_result[0]\n", - " keys = [list(item.keys()) for item in data][0]\n", - " values = [list(item.values()) for item in data][0]\n", - "\n", - " this_expected_result = expected_result[idx]\n", - " missing_keys = []\n", - " incorrect_keys = []\n", - " expected_keys = list(this_expected_result.keys())\n", - "\n", - " for key_idx, key in enumerate(expected_keys):\n", - " if key_idx < 0 or key_idx >= len(keys) or key != keys[key_idx]:\n", - " print(f\"Key {key} not found in report {idx+1}\")\n", - " missing_keys.append(key)\n", - " keys.insert(key_idx, key)\n", - " values.insert(key_idx, 'Missing')\n", - " elif not compare_vals(this_expected_result[key], values[keys.index(key)]):\n", - " print(f\"Incorrect {key}. Expected: {this_expected_result[key]}, Found: {values[keys.index(key)]}\")\n", - " incorrect_keys.append((key, this_expected_result[key], values[keys.index(key)]))\n", - " values[keys.index(key)] = f'{values[keys.index(key)]} * ({this_expected_result[key]})'\n", - "\n", - " different_keys = {'missing_keys': missing_keys, 'additional_keys': [], 'incorrect_keys': incorrect_keys}\n", - "\n", - " # Check for differences in keys\n", - " for key in keys:\n", - " if key not in input_keys:\n", - " different_keys['additional_keys'].append(key)\n", - " print(f\"Summary: {different_keys}\")\n", - "\n", - " # Create a DataFrame\n", - " values_list.append(values)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Last, we print out the table with all the values extracted from the report. If a value is missing, it's highlighted yellow. If it's an incorrect value, it's highlighted red with the expected value in parentheses." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
 CBC_ReportSample_1.pngCBC_ReportSample_12.jpegCBC_ReportSample_14.jpegCBC_ReportSample_20.jpeg
NAMETest Patient ATEST PATIENT 12Test Patient 14Test Patient 20
HAEMOGLOBIN12.612.814.811.3
RBC (RED CELLS COUNT)4.544.55.224.2
HAEMATOCRIT(PCV)38.14245.235
MCV83.99386.682
MCH27.82828.427
MCHC33.13132.733
TOTAL LEUCOCYTE COUNT4.6212.65.459,600
DIFFERENTIAL LEUCOCYTE COUNTNANANANA
NEUTROPHILS666764.358
LYMPHOCYTES282325.140
MONOCYTES06077.201
EOSINOPHILS00032.801
BASOPHIL00000.6NA
PROMYELOCYTESNANANANA
MYELOCYTESNANANANA
METAMYELOCYTESNANANANA
BLASTSNANANANA
PLATELETS195210287238,000
RETICULOCYTE COUNTNANANANA
\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.DataFrame({\n", - " report_files[0]: values_list[0],\n", - " report_files[1]: values_list[1],\n", - " report_files[2]: values_list[2],\n", - " report_files[3]: values_list[3],\n", - "}, index=input_keys)\n", - "\n", - "def highlight_incorrect_keys(val):\n", - " if val == 'Missing':\n", - " return 'background-color: yellow; color: black'\n", - " elif '*' in val:\n", - " return 'background-color: red'\n", - " return ''\n", - "\n", - "# Apply the style to the DataFrame\n", - "styled_df = df.style.map(highlight_incorrect_keys)\n", - "\n", - "# Display the styled DataFrame\n", - "styled_df\n" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -605,7 +325,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.12.2" } }, "nbformat": 4,