-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
First draft (object access isn't right yet)
- Loading branch information
1 parent
5616b72
commit bb563bb
Showing
1 changed file
with
322 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,322 @@ | ||
import docx | ||
import sys | ||
import os | ||
from openai import OpenAI | ||
|
||
|
||
import tiktoken | ||
import json | ||
from docx.oxml import OxmlElement | ||
import re | ||
|
||
# os.environ["OPENAI_API_KEY"] = "sk-..." | ||
|
||
client = OpenAI() | ||
|
||
|
||
from typing import List, Tuple, Optional | ||
|
||
__all__ = [ | ||
"get_labeled_docx_runs", | ||
"update_docx", | ||
"modify_docx_with_openai_guesses", | ||
] | ||
|
||
def add_paragraph_after(paragraph, text): | ||
p = OxmlElement('w:p') | ||
p.text = text | ||
paragraph._element.addnext(p) | ||
|
||
|
||
def add_paragraph_before(paragraph, text): | ||
p = OxmlElement('w:p') | ||
p.text = text | ||
paragraph._element.addprevious(p) | ||
|
||
|
||
def update_docx(document: docx.Document, modified_runs: Tuple[int,int,str,str,int] ) -> docx.Document: | ||
"""Update the document with the modified runs. | ||
Args: | ||
document: the docx.Document object | ||
modified_runs: a tuple of paragraph number, run number, the modified text, a question (not used), and whether a new paragraph should be inserted (for conditional text) | ||
Returns: | ||
The modified document. | ||
""" | ||
# Sort modified_runs in reverse order so inserted paragraphs are in the correct order | ||
modified_runs = sorted(modified_runs, key=lambda x: x[0], reverse=True) | ||
|
||
# also sort each run in the modified_runs so that the runs are in the correct order | ||
modified_runs = sorted(modified_runs, key=lambda x: x[1], reverse=True) | ||
|
||
for paragraph_number, run_number, modified_text, question, new_paragraph in modified_runs: | ||
paragraph = document.paragraphs[paragraph_number] | ||
run = paragraph.runs[run_number] | ||
if new_paragraph == 1: | ||
add_paragraph_after(paragraph, modified_text) | ||
elif new_paragraph == -1: | ||
add_paragraph_before(paragraph, modified_text) | ||
else: | ||
run.text = modified_text | ||
return document | ||
|
||
|
||
def clean_pseudo_json(string:str) -> dict: | ||
"""Use OpenAI to try to fix a broken JSON string. | ||
""" | ||
try: | ||
output = json.loads(string) | ||
except: | ||
try: | ||
# first just try removing newlines | ||
string_4_json = re.findall("\{.*\}",re.sub("\n","",string))[0] | ||
output = json.loads(string_4_json) | ||
except: | ||
try: | ||
# then try adding curly braces | ||
string = "{"+string+"}" | ||
string_4_json = re.findall("\{.*\}",re.sub("\n","",string))[0] | ||
output = json.loads(string_4_json) | ||
except Exception as e: | ||
prompt = f""" | ||
Invalid JSON string: | ||
``` | ||
{string} | ||
``` | ||
Error: '{e}' | ||
""" | ||
system_prompt = "You are a JSON fixing robot. You will accept a broken JSON string as input, along with an error message, and return a new parsable JSON string. Your response will only contain JSON and nothing else." | ||
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo") | ||
tokens_used = len(encoding.encode(system_prompt + prompt)) | ||
|
||
if tokens_used < 2048: | ||
model = "gpt-3.5-turbo" | ||
max_tokens = 4000 - tokens_used | ||
else: | ||
model = "gpt-4-1106-preview" | ||
max_tokens = 4096 | ||
response = client.chat.completions.create(model=model, | ||
messages=[ | ||
{ | ||
"role": "system", | ||
"content": system_prompt | ||
}, | ||
{ | ||
"role": "user", | ||
"content": prompt | ||
} | ||
], | ||
temperature=0.0, | ||
max_tokens=tiktoken, | ||
top_p=1, | ||
frequency_penalty=0.0, | ||
presence_penalty=0.0) | ||
try: | ||
output = json.loads(response.choices[0].message.content) | ||
except: | ||
output = dict() | ||
|
||
return output | ||
|
||
|
||
def get_labeled_docx_runs(docx_path: str, custom_people_names: Optional[Tuple[str, str]] = None) -> List[Tuple[int, int, str]]: | ||
"""Scan the DOCX and return a list of modified text with Jinja2 variable names inserted. | ||
Args: | ||
docx_path: path to the DOCX file | ||
custom_people_names: a tuple of custom names and descriptions to use in addition to the default ones. Like: ("clients", "the person benefiting from the form") | ||
Returns: | ||
A list of tuples, each containing a paragraph number, run number, and the modified text of the run. | ||
""" | ||
|
||
role_description = """ | ||
You will process a DOCX document and return a JSON structure that turns the DOCX file into a template | ||
based on the following guidelines and examples. The DOCX will be provided as an annotated series of | ||
paragraphs and runs. | ||
Steps: | ||
1. Analyze the document. Identify placeholder text and repeated _____ that should be replaced with a variable name. | ||
2. Insert jinja2 tags around a new variable name that represents the placeholder text. | ||
3. Add a draft of a short question to go with the variable. | ||
3. Mark optional paragraphs with conditional Jinja2 tags. | ||
4. Text intended for verbatim output in the final document will remain unchanged. | ||
6. The result will be a JSON structure that indicates which paragraphs and runs in the DOCX require modifications, | ||
the new text of the modified run with Jinja2 inserted, and a draft question to provide a definition of the variable. | ||
Example input, with paragraph and run numbers indicated: | ||
[ | ||
[0, 1, "Dear John Smith:"], | ||
[1, 0, "This sentence can stay as is in the output and will not be in the reply."], | ||
[2, 0, "[Optional: if you are a tenant, include this paragraph]"], | ||
] | ||
Example reply, indicating paragraph, run, the new text, a question, and a number indicating if this changes the | ||
current paragraph, adds one before, or adds one after (-1, 0, 1): | ||
{ | ||
"results": [ | ||
[0, 1, "Dear {{ other_parties[0] }}:", "Recipient name", 0], | ||
[2, 0, "{%p if is_tenant %}", "Are you a tenant?", -1], | ||
[3, 0, "{%p endif %}", "", 1], | ||
] | ||
} | ||
The reply is in JSON format with no other reply, and ONLY contains the runs that have modified text. | ||
""" | ||
|
||
custom_name_text = "" | ||
if custom_people_names: | ||
for name, description in custom_people_names: | ||
custom_name_text += f" {name} ({description}), \n" | ||
|
||
rules = f""" | ||
Rules for variable names: | ||
1. Variables usually refer to people or their attributes. | ||
2. People are stored in lists. | ||
3. We use Docassemble objects and conventions. | ||
4. Use variable names and patterns from the list below. Invent new variable names when it is appropriate. | ||
List names for people: | ||
{custom_people_names} | ||
users (for the person benefiting from the form, especially when for a pro se filer) | ||
other_parties (the opposing party in a lawsuit or transactional party) | ||
plaintiffs | ||
defendants | ||
petitioners | ||
respondents | ||
children | ||
spouses | ||
parents | ||
caregivers | ||
attorneys | ||
translators | ||
debt_collectors | ||
creditors | ||
witnesses | ||
guardians_ad_litem | ||
guardians | ||
decedents | ||
interested_parties | ||
Name Forms: | ||
users (full name of all users) | ||
users[0] (Full name) | ||
users[0].name.first (First name only) | ||
users[0].name.middle (Middle name only) | ||
users[0].name.middle_initial() (First letter of middle name) | ||
users[0].name.last (Last name only) | ||
users[0].name.suffix (Suffix of user's name only) | ||
Attribute names (replace `users` with the appropriate list name): | ||
Demographic Data: | ||
users[0].birthdate (Birthdate) | ||
users[0].age_in_years() (Calculated age based on birthdate) | ||
users[0].gender (Gender) | ||
users[0].gender_female (User is female, for checkbox field) | ||
users[0].gender_male (User is male, for checkbox field) | ||
users[0].gender_other (User is not male or female, for checkbox field) | ||
users[0].gender_nonbinary (User identifies as nonbinary, for checkbox field) | ||
users[0].gender_undisclosed (User chose not to disclose gender, for checkbox field) | ||
users[0].gender_self_described (User chose to self-describe gender, for checkbox field) | ||
user_needs_interpreter (User needs an interpreter, for checkbox field) | ||
user_preferred_language (User's preferred language) | ||
Addresses: | ||
users[0].address.block() (Full address, on multiple lines) | ||
users[0].address.on_one_line() (Full address on one line) | ||
users[0].address.line_one() (Line one of the address, including unit or apartment number) | ||
users[0].address.line_two() (Line two of the address, usually city, state, and Zip/postal code) | ||
users[0].address.address (Street address) | ||
users[0].address.unit (Apartment, unit, or suite) | ||
users[0].address.city (City or town) | ||
users[0].address.state (State, province, or sub-locality) | ||
users[0].address.zip (Zip or postal code) | ||
users[0].address.county (County or parish) | ||
users[0].address.country (Country) | ||
Other Contact Information: | ||
users[0].phone_number (Phone number) | ||
users[0].mobile_number (A phone number explicitly labeled as the "mobile" number) | ||
users[0].phone_numbers() (A list of both mobile and other phone numbers) | ||
users[0].email (Email) | ||
Signatures: | ||
users[0].signature (Signature) | ||
signature_date (Date the form is completed) | ||
Information about Court and Court Processes: | ||
trial_court (Court's full name) | ||
trial_court.address.county (County where court is located) | ||
trial_court.division (Division of court) | ||
trial_court.department (Department of court) | ||
docket_number (Case or docket number) | ||
docket_numbers (A comma-separated list of docket numbers) | ||
When No Existing Variable Name Exists: | ||
1. Craft short, readable variable names in python snake_case. | ||
2. Represent people with lists, even if only one person. | ||
3. Use valid Python variable names within complete Jinja2 tags, like: {{ new_variable_name }}. | ||
Special endings: | ||
Suffix _date for date values. | ||
Suffix _value or _amount for currency values. | ||
Examples: | ||
"(State the reason for eviction)" transforms into `{{ eviction_reason }}`. | ||
""" | ||
encoding = tiktoken.encoding_for_model("gpt-4") | ||
|
||
doc = docx.Document(docx_path) | ||
|
||
items = [] | ||
for pnum, para in enumerate(doc.paragraphs): | ||
for rnum, run in enumerate(para.runs): | ||
items.append([pnum, rnum, run.text]) | ||
|
||
encoding = tiktoken.encoding_for_model("gpt-4") | ||
token_count = len(encoding.encode(role_description + rules + repr(items))) | ||
|
||
response = client.chat.completions.create(model="gpt-4-1106-preview", | ||
messages=[ | ||
{ | ||
"role": "system", | ||
"content": role_description + rules | ||
}, | ||
{ | ||
"role": "user", | ||
"content": repr(items) | ||
} | ||
], | ||
temperature=.5, | ||
max_tokens=4096, | ||
top_p=1, | ||
frequency_penalty=0, | ||
presence_penalty=0) | ||
|
||
try: | ||
guesses = json.loads(response.choices[0].message.content) | ||
except: | ||
guesses = clean_pseudo_json(response.choices[0].message.content) | ||
|
||
|
||
def modify_docx_with_openai_guesses(docx_path: str) -> docx.Document: | ||
"""Uses OpenAI to guess the variable names for a document and then modifies the document with the guesses. | ||
Args: | ||
docx_path (str): Path to the DOCX file to modify. | ||
Returns: | ||
docx.Document: The modified document, ready to be saved to the same or a new path | ||
""" | ||
guesses = get_labeled_docx_runs(docx_path) | ||
|
||
return update_docx(docx_path, guesses) | ||
|
||
|
||
# Accept the filename from the commandline | ||
if __name__ == "__main__": | ||
new_doc = modify_docx_with_openai_guesses(sys.argv[1]) | ||
new_doc.save(sys.argv[1] + ".new.docx") |