# app.py
from flask import Flask, jsonify, request
from uuid import uuid4
from dotenv import load_dotenv
import openai
import os
import pandas as pd
import tiktoken
from urllib.parse import urlparse
import asyncio
import aiohttp
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from utils.webcrawl import crawl, remove_newlines, split_into_many
from flask_cors import CORS
load_dotenv()
openai.api_key = os.environ.get("OPEN_AI_APIKEY")
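# Note: the Pinecone credentials below are loaded but never used in this file.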
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV")
app = Flask(__name__)
app.config["DEBUG"] = True
CORS(app, origins=["*"])
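

# Note: async view functions require Flask 2.0+ installed with the async extra
# (pip install "flask[async]"); without it, Flask raises a RuntimeError at request time.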
async def embed_text(text):
    """Fetch an ada-002 embedding for `text` from the OpenAI embeddings endpoint."""
    async with aiohttp.ClientSession() as session:
        async with session.post(
            url="https://api.openai.com/v1/embeddings",
            headers={"Authorization": f"Bearer {os.environ.get('OPEN_AI_APIKEY')}"},
            json={"model": "text-embedding-ada-002", "input": text},
        ) as response:
            body = await response.json()
            return body["data"][0]["embedding"]
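
# For reference, the embeddings endpoint responds with JSON shaped roughly like
# (abridged; an ada-002 embedding has 1,536 dimensions):
#   {"data": [{"embedding": [0.0023, -0.0091, ...], "index": 0}], "model": "...", ...}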


@app.route("/", methods=["POST"])
async def generate_business_plan():
    # Expects a JSON body with "business_name", "business_idea", and a
    # comma-separated "domains" string.
    business_name = request.json["business_name"]
    business_idea = request.json["business_idea"]
    websites = request.json["domains"]
    domains = websites.split(",")
    texts = []
    for domain in domains:
        root_domain = urlparse(domain).netloc
        print("Crawling " + root_domain + "...")
        await crawl(domain)
        print("Reading files...")
        for file in os.listdir("text/" + root_domain + "/"):
            with open("text/" + root_domain + "/" + file, "r") as f:
                text = f.read()
                # Build a title from the file name (minus its ".txt" extension),
                # replacing "-" and "_" with spaces and dropping "#update" fragments.
                title = (
                    file[:-4]
                    .replace("-", " ")
                    .replace("_", " ")
                    .replace("#update", "")
                )
                texts.append((title, text))
print("Creating Dataframe...")
# Create a dataframe from the list of texts
df = pd.DataFrame(texts, columns=["fname", "text"])
# Set the text column to be the raw text with the newlines removed
df["text"] = df.fname + ". " + remove_newlines(df.text)
df.to_csv("processed/scraped.csv")
print("Created CSV file.")
    tokenizer = tiktoken.get_encoding("cl100k_base")
    df = pd.read_csv("processed/scraped.csv", index_col=0)
    df.columns = ["title", "text"]
    print("Encoding text...")
    # Tokenize the text and save the number of tokens to a new column
    df["n_tokens"] = df.text.apply(lambda x: len(tokenizer.encode(x)))
    shortened = []
    # Loop through the DataFrame, splitting any text over the 500-token budget
    for _, row in df.iterrows():
        # If the text is None, skip the row
        if row["text"] is None:
            continue
        # If the token count exceeds the maximum, split the text into chunks
        if row["n_tokens"] > 500:
            text_chunks = split_into_many(row["text"], 500, tokenizer)
            shortened.extend(
                [{"title": row["title"], "text": chunk} for chunk in text_chunks]
            )
        # Otherwise, keep the title and text as a single entry
        else:
            shortened.append({"title": row["title"], "text": row["text"]})
    df = pd.DataFrame(shortened, columns=["title", "text"])
    df["n_tokens"] = df.text.apply(lambda x: len(tokenizer.encode(x)))
print("Embedding text... from OpenAI")
df["embeddings"] = await asyncio.gather(*(embed_text(text) for text in df.text))
df["embeddings"] = df["embeddings"].apply(np.array)
df.to_csv("processed/embeddings.csv")
# Add an 'id' column to the DataFrame
df["id"] = [str(uuid4()) for _ in range(len(df))]
# Fill null values in 'title' column with 'No Title'
df["title"] = df["title"].fillna("No Title")
embed_model = "text-embedding-ada-002"
user_input = f"""My business name is: {business_name} My business idea is: {business_idea}.
From the information that I have provided, please provide a financial plan month to month and roadmap to help.
Also list the disadvantages of the business idea."""
print("Embedding user query...")
embedding = openai.Embedding.create(input=user_input, engine=embed_model)["data"][
0
]["embedding"]
# Convert the embedding to a NumPy array
embedding = np.array(embedding)
df["similarities"] = df.embeddings.apply(
lambda x: cosine_similarity(x.reshape(1, -1), embedding.reshape(1, -1))[0][0]
)
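    # ada-002 embeddings are unit-normalised, so this cosine similarity is
    # effectively a dot product; values closest to 1 mark the most relevant chunks.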
    # Keep the 15 most similar chunks as context for the prompt
    res = df.sort_values("similarities", ascending=False).head(15)
    contexts = res.text.tolist()
    augmented_query = "\n\n---\n\n".join(contexts) + "\n\n-----\n\n" + user_input
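    # The augmented prompt therefore looks roughly like:
    #   <chunk 1>\n\n---\n\n<chunk 2>\n\n---\n\n...\n\n-----\n\n<user question>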
    # System message assigning the model its role
    system_msg = """You are a business analyst looking to create a business based on
the weaknesses of the context provided. Provide a month-to-month financial plan and a roadmap
to help create this business. Give 3 business name ideas with it.
Format this all in Markdown."""
    chat = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": augmented_query},
        ],
    )
    print(chat)
    return jsonify(chat)


if __name__ == "__main__":
    app.run(host="0.0.0.0")
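# Example request (hypothetical values), assuming the server is reachable on port 5000:
#   curl -X POST http://localhost:5000/ \
#        -H "Content-Type: application/json" \
#        -d '{"business_name": "Acme", "business_idea": "eco-friendly packaging",
#            "domains": "https://example.com"}'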