# source:
# https://github.com/gkamradt/langchain-tutorials/blob/main/LangChain%20Cookbook%20Part%202%20-%20Use%20Cases.ipynb
# https://github.com/langchain-ai/langchain/blob/490ad93b3cf7d24b30f8993f860b654ff107e638/docs/extras/integrations/toolkits/pandas.ipynb#L8
# pip install langchain openai tiktoken faiss-cpu tabulate python-dotenv   (use faiss-gpu instead of faiss-cpu if a GPU is available)
# On Windows, behind a proxy, use: "C:\Users\<user>\AppData\Local\Programs\Python\Python311\python.exe" -m pip install pandas --proxy http://proxy.charite.de:8080
from dotenv import load_dotenv
import os
import ast
import pandas as pd
from langchain.llms import OpenAI
from langchain.chains.summarize import load_summarize_chain
# Text splitter for chunking long documents
from langchain.text_splitter import RecursiveCharacterTextSplitter
# The vectorstore we'll be using
from langchain.vectorstores import FAISS
# The LangChain component we'll use to get the documents
from langchain.chains import RetrievalQA
# The easy document loader for text
from langchain.document_loaders import TextLoader
# The embedding engine that will convert our text to vectors
from langchain.embeddings.openai import OpenAIEmbeddings
# To help construct our chat messages
from langchain.schema import HumanMessage
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
# We will be using a chat model; defaults to gpt-3.5-turbo
from langchain.chat_models import ChatOpenAI
# To parse outputs and get structured data back
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.agents import create_pandas_dataframe_agent
from langchain.agents.agent_types import AgentType
# Read the OpenAI API key and proxy from a local config file
with open('../config.txt', 'r', encoding='utf-8') as file:
    file_content = file.read()

api_key = ""
proxy = ""
for x in file_content.splitlines():
    if x.startswith("OPENAI_API_KEY"):
        api_key = x.split(" = ")[1].strip()
    if x.startswith("proxy"):
        proxy = x.split(" = ")[1].strip()
# Avoid echoing the full secret to the console
print(f"Loaded API key ({len(api_key)} chars)")
print(proxy)
#************************* 1. Summarization
def example_1_1():
    print("=========== Summaries Of Short Text ==================")
    load_dotenv()
    openai_api_key = os.getenv('OPENAI_API_KEY', api_key)
    # 'text-davinci-003' is already the default model, but it is named explicitly here so you know where to change it later if you want
    llm = OpenAI(temperature=0, model_name='text-davinci-003', openai_api_key=openai_api_key)
    # Create our template
    template = """
%INSTRUCTIONS:
Please summarize the following piece of text.
Respond in a manner that a 5 year old would understand.

%TEXT:
{text}
"""
    # Create a LangChain prompt template that we can insert values into later
    prompt = PromptTemplate(
        input_variables=["text"],
        template=template,
    )
    confusing_text = """
For the next 130 years, debate raged.
Some scientists called Prototaxites a lichen, others a fungus, and still others clung to the notion that it was some kind of tree.
“The problem is that when you look up close at the anatomy, it’s evocative of a lot of different things, but it’s diagnostic of nothing,” says Boyce, an associate professor in geophysical sciences and the Committee on Evolutionary Biology.
“And it’s so damn big that whenever someone says it’s something, everyone else’s hackles get up: ‘How could you have a lichen 20 feet tall?’”
"""
    print("------- Prompt Begin -------")
    final_prompt = prompt.format(text=confusing_text)
    print(final_prompt)
    print("------- Prompt End -------")
    output = llm(final_prompt)
    print(output)
def example_1_2():
    print("=========== Summaries Of Longer Text ==================")
    llm = OpenAI(temperature=0, openai_api_key=api_key)
    with open('sampleText1.txt', 'r') as file:
        text = file.read()
    # Print the first 285 characters as a preview
    print(text[:285])
    num_tokens = llm.get_num_tokens(text)
    print(f"There are {num_tokens} tokens in your file")
    text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=5000, chunk_overlap=350)
    docs = text_splitter.create_documents([text])
    print(f"You now have {len(docs)} docs instead of 1 piece of text")
    # Get your chain ready to use
    chain = load_summarize_chain(llm=llm, chain_type='map_reduce')  # verbose=True is optional to see what is getting sent to the LLM
    # Use it. This will summarize each chunk, then produce a summary of the summaries.
    output = chain.run(docs)
    print(output)
#************************* 2. Question and Answer
def example_2_1():
    print("=========== Q A Example ==================")
    llm = OpenAI(temperature=0, openai_api_key=api_key)
    context = """
Rachel is 30 years old
Bob is 45 years old
Kevin is 65 years old
"""
    question = "Who is under 40 years old?"
    output = llm(context + question)
    # Strip the text to remove leading and trailing whitespace
    print(output.strip())
    question = "Who is the oldest?"
    output = llm(context + question)
    print(output.strip())
def example_2_2():
    print("=========== Using Embeddings ==================")
    llm = OpenAI(temperature=0, openai_api_key=api_key)
    loader = TextLoader('sampleText2.txt')
    doc = loader.load()
    print(f"You have {len(doc)} document")
    print(f"You have {len(doc[0].page_content)} characters in that document")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=400)
    docs = text_splitter.split_documents(doc)
    # Get the total number of characters so we can see the average later
    num_total_characters = sum([len(x.page_content) for x in docs])
    print(f"Now you have {len(docs)} documents with an average of {num_total_characters / len(docs):,.0f} characters (smaller pieces)")
    # Get your embeddings engine ready
    embeddings = OpenAIEmbeddings(openai_api_key=api_key)
    # Embed your documents and combine them with the raw text in a pseudo-DB. Note: this will make an API call to OpenAI
    docsearch = FAISS.from_documents(docs, embeddings)
    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())
    query = "What is the rare disease that affects most people in the USA?"
    a = qa.run(query)
    print(a)
    query = "Which rare disease is the most deadly?"
    a = qa.run(query)
    print(a)
    query = "List all errors in MDcharts.py"
    a = qa.run(query)
    print(a)
#************************* 3. Extraction
def example_3_1():
    print("=========== Extraction: Vanilla Extraction ==================")
    chat_model = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo', openai_api_key=api_key)
    instructions = """
You will be given a sentence with fruit names; extract those fruit names and assign an emoji to each of them.
Return the fruit names and emojis in a python dictionary
"""
    fruit_names = """
Apple, Pear, this is a kiwi
"""
    # Make the prompt, which combines the instructions with the fruit names
    prompt = (instructions + fruit_names)
    # Call the LLM
    output = chat_model([HumanMessage(content=prompt)])
    print(output.content)
    print(type(output.content))
    # ast.literal_eval is a safer alternative to eval() for parsing the model's dictionary output
    output_dict = ast.literal_eval(output.content.strip())
    print(output_dict)
    print(type(output_dict))
def example_3_2():
    print("=========== Extraction: Using LangChain's Response Schema ==================")
    chat_model = ChatOpenAI(temperature=0, model_name='gpt-3.5-turbo', openai_api_key=api_key)
    # The schema I want out
    response_schemas = [
        ResponseSchema(name="artist", description="The name of the musical artist"),
        ResponseSchema(name="song", description="The name of the song that the artist plays")
    ]
    # The parser that will look for LLM output matching my schema and return it back to me
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
    # The format instructions that LangChain makes. Let's look at them
    format_instructions = output_parser.get_format_instructions()
    print("format_instructions: ", format_instructions)
    print("------------------------")
    # The prompt template that brings it all together
    # Note: this is a different prompt template than before because we are using a chat model
    prompt = ChatPromptTemplate(
        messages=[
            HumanMessagePromptTemplate.from_template(
                "Given a command from the user, extract the artist and song names\n"
                "{format_instructions}\n{user_prompt}")
        ],
        input_variables=["user_prompt"],
        partial_variables={"format_instructions": format_instructions}
    )
    artist_query = prompt.format_prompt(user_prompt="I really like So Young by Portugal. The Man")
    print(artist_query.messages[0].content)
    artist_output = chat_model(artist_query.to_messages())
    output = output_parser.parse(artist_output.content)
    print(output)
    print(type(output))
#************************* 4. Evaluation
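# A minimal sketch of use case 4, following the QAEvalChain approach from the cited
# cookbook. The question/answer pairs are made-up placeholders reusing the toy facts
# from example_2_1; adapt them to real data before trusting the grades.
def example_4_1():
    print("=========== Evaluation: Grading Q&A Answers ==================")
    from langchain.evaluation.qa import QAEvalChain
    llm = OpenAI(temperature=0, openai_api_key=api_key)
    context = "Rachel is 30 years old. Bob is 45 years old. Kevin is 65 years old."
    # Hand-written ground truth (placeholder examples)
    examples = [
        {"question": context + " Who is under 40 years old?", "answer": "Rachel"},
        {"question": context + " Who is the oldest?", "answer": "Kevin"},
    ]
    # Let the LLM answer each question, then let it grade its own predictions
    predictions = [{"result": llm(ex["question"]).strip()} for ex in examples]
    eval_chain = QAEvalChain.from_llm(llm)
    graded = eval_chain.evaluate(examples, predictions, question_key="question", prediction_key="result")
    for ex, pred, grade in zip(examples, predictions, graded):
        print(ex["question"], "->", pred["result"], "|", grade)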
#************************* 5. Query tabular data
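# Querying tabular data is demonstrated in example_11_1 below via create_pandas_dataframe_agent.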
#************************* 6. Code understanding
def example_6_1():
    # TODO: fix Windows proxy bug with tiktoken
    print("=========== Code Understanding ==================")
    # import tiktoken
    # tiktoken.encoding_for_model('gpt-3.5-turbo')
    llm = ChatOpenAI(model_name='gpt-3.5-turbo', openai_proxy=proxy, openai_api_key=api_key)
    embeddings = OpenAIEmbeddings(disallowed_special=(), openai_proxy=proxy, openai_api_key=api_key)
    #root_dir = 'synth-md'
    root_dir = "../arxPipline/privacy-mgmt-anonymization"
    docs = []
    counter = 0
    # Go through each folder
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Go through each file
        for fnm in filenames:
            try:
                # Load up the file as a doc and split it
                counter += 1
                filePath = os.path.join(dirpath, fnm)
                loader = TextLoader(filePath, encoding='utf-8')
                docs.extend(loader.load_and_split())
            except Exception:
                # Skip files that cannot be read as text
                pass
    print(f"You have {counter} files\n")
    print(f"You have {len(docs)} documents\n")
    print("------ Start Document ------")
    print(docs[0].page_content[:300])
    print("------------ Code Analysis ------------------")
    docsearch = FAISS.from_documents(docs, embeddings)
    print("Get our retriever ready ...")
    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())
    # query = "Convert the synth-md python package to a maven java package"
    query = "Can the json files be improved?"
    output = qa.run(query)
    print(output)
#************************* 7. Interaction with APIs
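# A minimal sketch of use case 7, following the cookbook's APIChain pattern.
# The api_docs string is a pared-down stand-in describing the public
# restcountries.com API; point it at real, fuller API documentation as needed.
def example_7_1():
    print("=========== Interaction with APIs ==================")
    from langchain.chains import APIChain
    llm = OpenAI(temperature=0, openai_api_key=api_key)
    api_docs = """
BASE URL: https://restcountries.com/

API Documentation:
The /v3.1/name/{name} endpoint returns information about a country: its capital, population, currencies, and languages.
"""
    chain = APIChain.from_llm_and_api_docs(llm, api_docs, verbose=True)
    output = chain.run("Can you tell me information about France?")
    print(output)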
#************************* 8. Chatbots
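# A minimal sketch of use case 8: a chatbot built from ConversationChain with
# buffer memory, a standard LangChain pattern. The two turns are illustrative.
def example_8_1():
    print("=========== Chatbots ==================")
    from langchain.chains import ConversationChain
    from langchain.memory import ConversationBufferMemory
    llm = ChatOpenAI(temperature=0.7, openai_api_key=api_key)
    conversation = ConversationChain(llm=llm, memory=ConversationBufferMemory(), verbose=True)
    print(conversation.predict(input="Hi there! My name is Sam."))
    # The buffer memory lets the model recall the name from the previous turn
    print(conversation.predict(input="What is my name?"))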
#************************* 9. Agents
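# A minimal sketch of use case 9: a zero-shot ReAct agent with the built-in
# llm-math tool (chosen here because, unlike the cookbook's search tools,
# it needs no extra API keys).
def example_9_1():
    print("=========== Agents ==================")
    from langchain.agents import load_tools, initialize_agent
    llm = OpenAI(temperature=0, openai_api_key=api_key)
    tools = load_tools(["llm-math"], llm=llm)
    agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)
    print(agent.run("What is 3 to the power of 2.1?"))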
#************************* 10. FIN
#************************* 11. Generalization of attribute values
def example_11_1():
    print("=========== Generalization of attribute values ==================")
    dataPath = "data/adult500.csv"
    df = pd.read_csv(dataPath)
    # opn = OpenAI(temperature=0, openai_api_key=api_key)
    # agent = create_pandas_dataframe_agent(opn, df, verbose=True)
    cht = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613", openai_proxy=proxy, openai_api_key=api_key)
    agent = create_pandas_dataframe_agent(cht, df, verbose=False, agent_type=AgentType.OPENAI_FUNCTIONS)
    print("------------------------")
    q1 = "how many rows are there?"
    a1 = agent.run(q1)
    print(q1, a1)
    # print("------------------------")
    # agent.run("whats the square root of the average age?")
    print("------------------------")
    q2 = "What are the unique values of the workclass attribute?"
    a2 = agent.run(q2)
    print(q2, a2)
    print("------------------------")
    q3 = ("Suggest 3 levels of generalization for the values of the workclass "
          "attribute that can be used for k-anonymity. "
          "Format the output as json")
    a3 = agent.run(q3)
    print(q3, a3)
    print("------------------------")
    # df1 = df.copy()
    # df1["Age"] = df1["Age"].fillna(df1["Age"].mean())
    # agent = create_pandas_dataframe_agent(OpenAI(temperature=0), [df, df1], verbose=True)
    # agent.run("how many rows in the age column are different?")
print("xxxxxxxxxxxxxxxx LangChain Demo xxxxxxxxxxxxxxxx")
#example_1_1()
#example_1_2()
#example_2_1()
#example_2_1()
#example_3_1()
#example_3_2()
#example_6_1()
example_11_1()