diff --git a/explore_labeling_docx.ipynb b/explore_labeling_docx.ipynb index db4ea9b..7d065af 100644 --- a/explore_labeling_docx.ipynb +++ b/explore_labeling_docx.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 39, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -10,83 +10,99 @@ "import os\n", "import openai\n", "\n", - "openai.api_key = os.getenv(\"OPENAI_API_KEY\")\n" + "with open(\"openai_key.txt\", \"r\") as file:\n", + " openai.api_key = file.read().rstrip()" ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "doc = Document(\"test_documents/emergency_guardianship.docx\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(0, 0) Your name Default Paragraph Font\n", - "(1, 0) Your address Default Paragraph Font\n", - "(2, 0) Your telephone Default Paragraph Font\n", - "(2, 1) number Default Paragraph Font\n", - "(3, 0) Date Default Paragraph Font\n", - "(4, 0) Name of Merchant Default Paragraph Font\n", - "(5, 0) Merchant's address Default Paragraph Font\n", - "(7, 0) Dear Merchant: Default Paragraph Font\n", - "(9, 0) Under the provisions of Massachusetts General Laws, Chapter 93A, Section 9, I hereby make written Default Paragraph Font\n", - "(10, 0) demand for relief as outlined in that statute. Default Paragraph Font\n", - "(12, 0) On or about Default Paragraph Font\n", - "(12, 1) {DATE} Default Paragraph Font\n", - "(12, 2) , the following unfair or deceptive act occurred: Default Paragraph Font\n", - "(12, 3) Default Paragraph Font\n", - "(13, 0) {EXPLAIN WHAT HAPPENED} Default Paragraph Font\n", - "(15, 0) This unfair or deceptive act or practice is, in my opinion, declared unlawful by Section 2 of Chapter 93A, Default Paragraph Font\n", - "(16, 0) (you may want to give regulation number, if applicable) which reads as follows: Default Paragraph Font\n", - "(17, 0) {Quote text or section. Remember: You are not required to quote written regulations or laws to support Default Paragraph Font\n", - "(18, 0) the assertion that the merchant's conduct was unfair or deceptive; it is, however, desirable. You will want Default Paragraph Font\n", - "(19, 0) to include all the regulations which you believe were violated.} Default Paragraph Font\n", - "(21, 0) As a result of this unfair or deceptive act or practice, I suffered injury or loss of money as follows: Default Paragraph Font\n", - "(22, 0) {Indicate Injury or Money or PropertyLoss} Default Paragraph Font\n", - "(24, 0) Therefore, I hereby demand the following relief: Default Paragraph Font\n", - "(25, 0) {Indicate Relief, or Payment for Damages, Default Paragraph Font\n", - "(25, 1) Which Default Paragraph Font\n", - "(25, 2) is Sought} Default Paragraph Font\n", - "(27, 0) Chapter 93A gives you the opportunity to make a good-faith response to this letter within thirty (30) days. Default Paragraph Font\n", - "(28, 0) Your failure to do so-could subject you to triple damages, attorney's fees and costs if I decide to institute Default Paragraph Font\n", - "(29, 0) legal action. Default Paragraph Font\n", - "(31, 0) Sincerely, Default Paragraph Font\n", - "(33, 0) Your Name Default Paragraph Font\n" + "(0, 0) Your name\n", + "(1, 0) Your address\n", + "(2, 0) Your telephone \n", + "(2, 1) number\n", + "(3, 0) Date\n", + "(4, 0) Name of Merchant\n", + "(5, 0) Merchant's address\n", + "(7, 0) Dear Merchant:\n", + "(9, 0) Under the provisions of Massachusetts General Laws, Chapter 93A, Section 9, I hereby make written\n", + "(10, 0) demand for relief as outlined in that statute.\n", + "(12, 0) On or about \n", + "(12, 1) {date}\n", + "(12, 2) , the following unfair or deceptive act occurred:\n", + "(12, 3) \n", + "(13, 0) {EXPLAIN WHAT HAPPENED}\n", + "(15, 0) This unfair or deceptive act or practice is, in my opinion, declared unlawful by Section 2 of Chapter 93A,\n", + "(16, 0) (you may want to give regulation number, if applicable) which reads as follows:\n", + "(17, 0) {Quote text or section. Remember: You are not required to quote written regulations or laws to support\n", + "(18, 0) the assertion that the merchant's conduct was unfair or deceptive; it is, however, desirable. You will want\n", + "(19, 0) to include all the regulations which you believe were violated.}\n", + "(21, 0) As a result of this unfair or deceptive act or practice, I suffered injury or loss of money as follows:\n", + "(22, 0) {Indicate Injury or Money or PropertyLoss}\n", + "(24, 0) Therefore, I hereby demand the following relief:\n", + "(25, 0) {Indicate Relief, or Payment for Damages, \n", + "(25, 1) Which\n", + "(25, 2) is Sought}\n", + "(27, 0) Chapter 93A gives you the opportunity to make a good-faith response to this letter within thirty (30) days.\n", + "(28, 0) Your failure to do so-could subject you to triple damages, attorney's fees and costs if I decide to institute\n", + "(29, 0) legal action.\n", + "(31, 0) Sincerely,\n", + "(33, 0) Your Name\n" ] } ], "source": [ - "doc = Document(\"test_documents/93A_demand_letter_sample.docx\")\n", + "# doc = Document(\"test_documents/93A_demand_letter_sample.docx\")\n", "\n", - "for pnum, para in enumerate(doc.paragraphs):\n", - " for rnum, run in enumerate(para.runs):\n", - " print((pnum, rnum), run.text)" + "# for pnum, para in enumerate(doc.paragraphs):\n", + "# for rnum, run in enumerate(para.runs):\n", + "# print((pnum, rnum), run.text)" ] }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "role_description = \"\"\"\n", - "You will process a DOCX document and return a JSON structure that turns the DOCX file into a template based on the following guidelines and examples.\n", + "You will process a DOCX document and return a JSON structure that turns the DOCX file into a template \n", + "based on the following guidelines and examples.\n", "\n", "Steps:\n", "1. Analyze the document. Identify placeholder text and repeated _____ that should be replaced with a variable name.\n", - "2. Mark optional paragraphs with conditional Jinja2 tags.\n", - "3. Text intended for verbatim output in the final document will remain unchanged.\n", - "4. The result will be a JSON structure that indicates which paragraphs and runs in the DOCX require modifications.\n", + "2. Mark the start and end of the placeholder text so it can be entirely removed from the output.\n", + "3. Insert jinja2 tags around a new variable name that represents the placeholder text.\n", + "4. Mark optional paragraphs with conditional Jinja2 tags.\n", + "5. Text intended for verbatim output in the final document will remain unchanged.\n", + "6. The result will be a JSON structure that indicates which paragraphs and runs in the DOCX require modifications,\n", + " the start and end of the placeholder text, and the new variable in Jinja2 syntax.\n", "\n", "Example input, with paragraph and run numbers indicated:\n", "[\n", " [0, 1, \"Dear John Smith:\"],\n", - " [1, 0, \"This sentence can stay as is in the output and will not in the reply.\"],\n", + " [1, 0, \"This sentence can stay as is in the output and will not be in the reply.\"],\n", " [2, 0, \"[Optional: if you are a tenant, include this paragraph]\"],\n", "]\n", "\n", - "Example reply, indicating paragraph, run, the new text, the starting position and ending position of the text that will be replaced, and whether a new paragraph should be inserted (for conditional text):\n", + "Example reply, indicating paragraph, run, the new text, the starting position and ending position \n", + "of the placeholder text, and whether a new paragraph should be inserted (for conditional text):\n", + "\n", "{\n", " \"results\": [\n", " [0, 1, \"{{ other_parties[0] }}\", 5, 15, false],\n", @@ -95,7 +111,7 @@ " ]\n", "}\n", "\n", - "The reply ONLY contains the modified text and starting/ending positions for the replacements.\n", + "The reply ONLY contains the modified text and starting/ending positions of the placeholder text.\n", "Note that we want the starting/ending position of the REMOVED text. Not the new text.\n", "\"\"\"\n", "\n", @@ -197,12 +213,10 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ - "doc = Document(\"test_documents/93A_demand_letter_sample.docx\")\n", - "\n", "items = []\n", "for pnum, para in enumerate(doc.paragraphs):\n", " for rnum, run in enumerate(para.runs):\n", @@ -211,7 +225,7 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -219,24 +233,24 @@ "output_type": "stream", "text": [ "{\n", - " \"id\": \"chatcmpl-8AP48dp07Wgj7s17NiElnbjGGV79M\",\n", + " \"id\": \"chatcmpl-8CHwgE5Ai8YAWjm34ts2Qgju1lUnS\",\n", " \"object\": \"chat.completion\",\n", - " \"created\": 1697490324,\n", + " \"created\": 1697939610,\n", " \"model\": \"gpt-4-0613\",\n", " \"choices\": [\n", " {\n", " \"index\": 0,\n", " \"message\": {\n", " \"role\": \"assistant\",\n", - " \"content\": \"{\\n \\\"results\\\": [\\n [0, 0, \\\"{{ users[0].name }}\\\", 0, 9, false],\\n [1, 0, \\\"{{ users[0].address.block() }}\\\", 0, 12, false],\\n [2, 0, \\\"{{ users[0].phone_number }}\\\", 0, 15, false],\\n [3, 0, \\\"{{ signature_date }}\\\", 0, 4, false],\\n [4, 0, \\\"{{ other_parties[0].name }}\\\", 0, 15, false],\\n [5, 0, \\\"{{ other_parties[0].address.block() }}\\\", 0, 17, false],\\n [7, 0, \\\"Dear {{ other_parties[0].name }}:\\\", 0, 14, false],\\n [12, 1, \\\"{{ incident_date }}\\\", 0, 6, false],\\n [13, 0, \\\"{{ incident_description }}\\\", 0, 21, false],\\n [16, 0, \\\"{{ applicable_regulation_number }}\\\", 0, 55, false],\\n [17, 0, \\\"{{ regulation_quote }}\\\", 0, 104, false],\\n [22, 0, \\\"{{ injury_or_loss_description }}\\\", 0, 33, false],\\n [25, 0, \\\"{{ demanded_relief }}\\\", 0, 29, false],\\n [31, 0, \\\"Sincerely,\\\", 0, 9, false],\\n [33, 0, \\\"{{ users[0].name }}\\\", 0, 9, false]\\n ]\\n}\"\n", + " \"content\": \"{\\\"results\\\": [[0, 0, \\\"Superior Court of Washington, County of {{ trial_court.address.county }}\\\", 39, 39, false], [2, 3, \\\"{{ other_parties[0].name.first }}\\\", 2, 2, false], [2, 5, \\\"{{ children[0].name.first }}\\\", 2, 5, false], [2, 9, \\\"{{ guardians[0].name.first }}\\\", 2, 9, false], [2, 11, \\\"{{ other_parties[0].name.first }}\\\", 2, 11, false], [2, 14, \\\"{{ other_parties[0].name.first }}\\\", 2, 14, false], [2, 17, \\\"{{ other_parties[0].name.first }}\\\", 2, 17, false], [2, 20, \\\"{{ other_parties[0].name.first }}\\\", 2, 20, false], [2, 22, \\\"{{ other_parties[0].name.first }}\\\", 2, 22, false], [4, 0, \\\"{{ hearing_date }}\\\", 0, 0, false], [4, 4, \\\"{{ hearing_location }}\\\", 4, 4, false], [4, 6, \\\"{{ hearing_time }}\\\", 6, 6, false], [6, 1, \\\"{{ trial_court.address.on_one_line() }}\\\", 4, 4, false], [8, 3, \\\"{{ court_room }}\\\", 3, 3, false], [10, 2, \\\"{{ judge_name }}\\\", 2, 2, false], [13, 3, \\\"{{ petitioners[0].name.first }}\\\", 3, 3, false], [14, 3, \\\"{{ proposed_guardians[0].name.first }}\\\", 3, 3, false], [15, 3, \\\"{{ children[0].name.first }}\\\", 3, 3, false], [29, 1, \\\"{{ trial_court.address.county }}\\\", 1, 1, false], [31, 0, \\\"{{ trial_court.address.on_one_line() }}\\\", 0, 3, false], [35, 0, \\\"{{ petitioners[0].name.first }}\\\", 0, 0, false], [35, 2, \\\"{{ signature_date }}\\\", 2, 2, false], [37, 2, \\\"{{ petitioners[0].email }}\\\", 2, 2, false], [38, 1, \\\"{{ petitioners[0].phone_number }}\\\", 1, 1, false], [39, 2, \\\"{{ petitioners[0].address.on_one_line() }}\\\", 2, 2, false], [41, 0, \\\"{{ petitioners[0].address.on_one_line() }}\\\", 0, 3, false]]}\"\n", " },\n", " \"finish_reason\": \"stop\"\n", " }\n", " ],\n", " \"usage\": {\n", - " \"prompt_tokens\": 1764,\n", - " \"completion_tokens\": 347,\n", - " \"total_tokens\": 2111\n", + " \"prompt_tokens\": 3576,\n", + " \"completion_tokens\": 616,\n", + " \"total_tokens\": 4192\n", " }\n", "}\n" ] @@ -263,7 +277,7 @@ " }\n", " ],\n", " temperature=.5,\n", - " max_tokens=3900 - token_count,\n", + " max_tokens=8000 - token_count,\n", " top_p=1,\n", " frequency_penalty=0,\n", " presence_penalty=0,\n", @@ -274,28 +288,39 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{{ users[0].name }}\n", - "{{ users[0].address.block() }}\n", - "{{ users[0].phone_number }}\n", + "Superior Court of Washington, County of {{ trial_court.address.county }}\n", + "{{ other_parties[0].name.first }}\n", + "{{ children[0].name.first }}\n", + "{{ guardians[0].name.first }}\n", + "{{ other_parties[0].name.first }}\n", + "{{ other_parties[0].name.first }}\n", + "{{ other_parties[0].name.first }}\n", + "{{ other_parties[0].name.first }}\n", + "{{ other_parties[0].name.first }}\n", + "{{ hearing_date }}\n", + "{{ hearing_location }}\n", + "{{ hearing_time }}\n", + "{{ trial_court.address.on_one_line() }}\n", + "{{ court_room }}\n", + "{{ judge_name }}\n", + "{{ petitioners[0].name.first }}\n", + "{{ proposed_guardians[0].name.first }}\n", + "{{ children[0].name.first }}\n", + "{{ trial_court.address.county }}\n", + "{{ trial_court.address.on_one_line() }}\n", + "{{ petitioners[0].name.first }}\n", "{{ signature_date }}\n", - "{{ other_parties[0].name }}\n", - "{{ other_parties[0].address.block() }}\n", - "Dear {{ other_parties[0].name }}:\n", - "{{ incident_date }}\n", - "{{ incident_description }}\n", - "{{ applicable_regulation_number }}\n", - "{{ regulation_quote }}\n", - "{{ injury_or_loss_description }}\n", - "{{ demanded_relief }}\n", - "Sincerely,\n", - "{{ users[0].name }}\n" + "{{ petitioners[0].email }}\n", + "{{ petitioners[0].phone_number }}\n", + "{{ petitioners[0].address.on_one_line() }}\n", + "{{ petitioners[0].address.on_one_line() }}\n" ] } ], @@ -315,7 +340,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -342,7 +367,7 @@ "\n", "# Don't do the new paragraphs just yet, that will come next\n", "\n", - "doc.save(\"test_documents/93A_demand_letter_sample_output.docx\")" + "doc.save(\"test_documents/guardianship_output.docx\")" ] } ], diff --git a/test_documents/emergency_guardianship.docx b/test_documents/emergency_guardianship.docx new file mode 100644 index 0000000..99fcdb6 Binary files /dev/null and b/test_documents/emergency_guardianship.docx differ