First draft (object access isn't right yet)

SuffolkLITLab · Nov 13, 2023 · bb563bb · bb563bb
1 parent 5616b72
commit bb563bb
Showing 1 changed file with 322 additions and 0 deletions.
diff --git a/formfyxer/docx_wrangling.py b/formfyxer/docx_wrangling.py
@@ -0,0 +1,322 @@
+import docx
+import sys
+import os
+from openai import OpenAI
+
+
+import tiktoken
+import json
+from docx.oxml import OxmlElement
+import re
+
+# To set the API key from code, uncomment the line below and fill in a key
+# (never commit a real key to version control).
+# os.environ["OPENAI_API_KEY"] = "sk-..."
+
+# Shared OpenAI client used by the functions in this module. It is created
+# when the module is imported.
+# NOTE(review): presumably the client reads OPENAI_API_KEY from the
+# environment when no key is passed -- confirm against the openai package docs.
+client = OpenAI()
+
+
+from typing import List, Tuple, Optional
+
+# Names exported by `from <this module> import *`.
+__all__ = [
+    "get_labeled_docx_runs",
+    "update_docx",
+    "modify_docx_with_openai_guesses",
+]
+
+def add_paragraph_after(paragraph, text):
+    p = OxmlElement('w:p')
+    p.text = text
+    paragraph._element.addnext(p)
+
+
+def add_paragraph_before(paragraph, text):
+    p = OxmlElement('w:p')
+    p.text = text
+    paragraph._element.addprevious(p)
+
+
+def update_docx(document: docx.Document, modified_runs: Tuple[int,int,str,str,int] ) -> docx.Document:
+    """Update the document with the modified runs.
+
+    Args:
+        document: the docx.Document object
+        modified_runs: a tuple of paragraph number, run number, the modified text, a question (not used), and whether a new paragraph should be inserted (for conditional text)
+
+    Returns:
+        The modified document.
+    """
+    # Sort modified_runs in reverse order so inserted paragraphs are in the correct order
+    modified_runs = sorted(modified_runs, key=lambda x: x[0], reverse=True)
+
+    # also sort each run in the modified_runs so that the runs are in the correct order
+    modified_runs = sorted(modified_runs, key=lambda x: x[1], reverse=True)
+
+    for paragraph_number, run_number, modified_text, question, new_paragraph in modified_runs:
+        paragraph = document.paragraphs[paragraph_number]
+        run = paragraph.runs[run_number]
+        if new_paragraph == 1:
+            add_paragraph_after(paragraph, modified_text)
+        elif new_paragraph == -1:
+            add_paragraph_before(paragraph, modified_text)
+        else:
+            run.text = modified_text
+    return document
+
+
+def clean_pseudo_json(string:str) -> dict:
+    """Use OpenAI to try to fix a broken JSON string.
+    """
+    try:
+        output = json.loads(string)
+    except:
+        try:
+            # first just try removing newlines
+            string_4_json = re.findall("\{.*\}",re.sub("\n","",string))[0]
+            output = json.loads(string_4_json)
+        except:
+            try:
+                # then try adding curly braces
+                string = "{"+string+"}"
+                string_4_json = re.findall("\{.*\}",re.sub("\n","",string))[0]
+                output = json.loads(string_4_json)
+            except Exception as e:
+                prompt = f"""
+                Invalid JSON string:
+                ```
+                {string}
+                ```
+
+                Error: '{e}'
+                """
+                system_prompt = "You are a JSON fixing robot. You will accept a broken JSON string as input, along with an error message, and return a new parsable JSON string. Your response will only contain JSON and nothing else."
+                encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
+                tokens_used = len(encoding.encode(system_prompt + prompt))
+
+                if tokens_used < 2048:
+                    model = "gpt-3.5-turbo"
+                    max_tokens = 4000 - tokens_used
+                else:
+                    model = "gpt-4-1106-preview"
+                    max_tokens = 4096
+                    response = client.chat.completions.create(model=model,
+                    messages=[
+                        {
+                            "role": "system",
+                            "content": system_prompt
+                        },
+                        {
+                            "role": "user",
+                            "content": prompt
+                        }
+                        ],
+                    temperature=0.0,
+                    max_tokens=tiktoken,
+                    top_p=1,
+                    frequency_penalty=0.0,
+                    presence_penalty=0.0)
+                    try:
+                        output = json.loads(response.choices[0].message.content)
+                    except:
+                        output = dict()
+
+    return output
+
+
+def get_labeled_docx_runs(docx_path: str, custom_people_names: Optional[Tuple[str, str]] = None) -> List[Tuple[int, int, str]]:
+    """Scan the DOCX and return a list of modified text with Jinja2 variable names inserted.
+
+    Args:
+        docx_path: path to the DOCX file
+        custom_people_names: a tuple of custom names and descriptions to use in addition to the default ones. Like: ("clients", "the person benefiting from the form")
+
+    Returns:
+        A list of tuples, each containing a paragraph number, run number, and the modified text of the run.
+    """
+
+    role_description = """
+    You will process a DOCX document and return a JSON structure that turns the DOCX file into a template 
+    based on the following guidelines and examples. The DOCX will be provided as an annotated series of
+    paragraphs and runs.
+
+    Steps:
+    1. Analyze the document. Identify placeholder text and repeated _____ that should be replaced with a variable name.
+    2. Insert jinja2 tags around a new variable name that represents the placeholder text.
+    3. Add a draft of a short question to go with the variable.
+    3. Mark optional paragraphs with conditional Jinja2 tags.
+    4. Text intended for verbatim output in the final document will remain unchanged.
+    6. The result will be a JSON structure that indicates which paragraphs and runs in the DOCX require modifications,
+    the new text of the modified run with Jinja2 inserted, and a draft question to provide a definition of the variable.
+
+    Example input, with paragraph and run numbers indicated:
+    [
+        [0, 1, "Dear John Smith:"],
+        [1, 0, "This sentence can stay as is in the output and will not be in the reply."],
+        [2, 0, "[Optional: if you are a tenant, include this paragraph]"],
+    ]
+
+    Example reply, indicating paragraph, run, the new text, a question, and a number indicating if this changes the 
+    current paragraph, adds one before, or adds one after (-1, 0, 1):
+
+    {
+        "results": [
+            [0, 1, "Dear {{ other_parties[0] }}:", "Recipient name", 0],
+            [2, 0, "{%p if is_tenant %}", "Are you a tenant?", -1],
+            [3, 0, "{%p endif %}", "", 1],
+        ]
+    }
+
+    The reply is in JSON format with no other reply, and ONLY contains the runs that have modified text.
+    """
+
+    custom_name_text = ""
+    if custom_people_names:
+        for name, description in custom_people_names:
+            custom_name_text += f"    {name} ({description}), \n"
+
+    rules = f"""
+    Rules for variable names:
+        1. Variables usually refer to people or their attributes.
+        2. People are stored in lists.
+        3. We use Docassemble objects and conventions.
+        4. Use variable names and patterns from the list below. Invent new variable names when it is appropriate.
+
+    List names for people:
+        {custom_people_names}
+        users (for the person benefiting from the form, especially when for a pro se filer)
+        other_parties (the opposing party in a lawsuit or transactional party)
+        plaintiffs
+        defendants
+        petitioners
+        respondents
+        children
+        spouses
+        parents
+        caregivers
+        attorneys
+        translators
+        debt_collectors
+        creditors
+        witnesses
+        guardians_ad_litem
+        guardians
+        decedents
+        interested_parties
+
+        Name Forms:
+            users (full name of all users)
+            users[0] (Full name)
+            users[0].name.first (First name only)
+            users[0].name.middle (Middle name only)
+            users[0].name.middle_initial() (First letter of middle name)
+            users[0].name.last (Last name only)
+            users[0].name.suffix (Suffix of user's name only)
+
+    Attribute names (replace `users` with the appropriate list name):
+        Demographic Data:
+            users[0].birthdate (Birthdate)
+            users[0].age_in_years() (Calculated age based on birthdate)
+            users[0].gender (Gender)
+            users[0].gender_female (User is female, for checkbox field)
+            users[0].gender_male (User is male, for checkbox field)
+            users[0].gender_other (User is not male or female, for checkbox field)
+            users[0].gender_nonbinary (User identifies as nonbinary, for checkbox field)
+            users[0].gender_undisclosed (User chose not to disclose gender, for checkbox field)
+            users[0].gender_self_described (User chose to self-describe gender, for checkbox field)
+            user_needs_interpreter (User needs an interpreter, for checkbox field)
+            user_preferred_language (User's preferred language)
+
+        Addresses:
+            users[0].address.block() (Full address, on multiple lines)
+            users[0].address.on_one_line() (Full address on one line)
+            users[0].address.line_one() (Line one of the address, including unit or apartment number)
+            users[0].address.line_two() (Line two of the address, usually city, state, and Zip/postal code)
+            users[0].address.address (Street address)
+            users[0].address.unit (Apartment, unit, or suite)
+            users[0].address.city (City or town)
+            users[0].address.state (State, province, or sub-locality)
+            users[0].address.zip (Zip or postal code)
+            users[0].address.county (County or parish)
+            users[0].address.country (Country)
+
+        Other Contact Information:
+            users[0].phone_number (Phone number)
+            users[0].mobile_number (A phone number explicitly labeled as the "mobile" number)
+            users[0].phone_numbers() (A list of both mobile and other phone numbers)
+            users[0].email (Email)
+
+        Signatures:
+            users[0].signature (Signature)
+            signature_date (Date the form is completed)
+
+        Information about Court and Court Processes:
+            trial_court (Court's full name)
+            trial_court.address.county (County where court is located)
+            trial_court.division (Division of court)
+            trial_court.department (Department of court)
+            docket_number (Case or docket number)
+            docket_numbers (A comma-separated list of docket numbers)
+            
+    When No Existing Variable Name Exists:
+        1. Craft short, readable variable names in python snake_case.
+        2. Represent people with lists, even if only one person.
+        3. Use valid Python variable names within complete Jinja2 tags, like: {{ new_variable_name }}.
+
+        Special endings:
+            Suffix _date for date values.
+            Suffix _value or _amount for currency values.
+
+        Examples: 
+        "(State the reason for eviction)" transforms into `{{ eviction_reason }}`.
+    """
+    encoding = tiktoken.encoding_for_model("gpt-4")
+
+    doc = docx.Document(docx_path)
+
+    items = []
+    for pnum, para in enumerate(doc.paragraphs):
+        for rnum, run in enumerate(para.runs):
+            items.append([pnum, rnum, run.text])
+
+    encoding = tiktoken.encoding_for_model("gpt-4")
+    token_count = len(encoding.encode(role_description + rules + repr(items)))
+
+    response = client.chat.completions.create(model="gpt-4-1106-preview",
+    messages=[
+        {
+            "role": "system",
+            "content": role_description + rules
+        },
+        {
+        "role": "user",
+        "content": repr(items)
+        }
+    ],
+    temperature=.5,
+    max_tokens=4096,
+    top_p=1,
+    frequency_penalty=0,
+    presence_penalty=0)
+
+    try:
+        guesses = json.loads(response.choices[0].message.content)
+    except:
+        guesses = clean_pseudo_json(response.choices[0].message.content)
+
+
+def modify_docx_with_openai_guesses(docx_path: str) -> docx.Document:
+    """Uses OpenAI to guess the variable names for a document and then modifies the document with the guesses.
+
+    Args:
+        docx_path (str): Path to the DOCX file to modify.
+    
+    Returns:
+        docx.Document: The modified document, ready to be saved to the same or a new path
+    """
+    guesses = get_labeled_docx_runs(docx_path)
+
+    return update_docx(docx_path, guesses)
+
+
+# Accept the filename from the commandline
+if __name__ == "__main__":
+    new_doc = modify_docx_with_openai_guesses(sys.argv[1])
+    new_doc.save(sys.argv[1] + ".new.docx")