
Commit 5d520fc

add evaluation code&prompt
1 parent 908b1a4 commit 5d520fc

1 file changed: +214 -0

dataset/LiHua-World/evaluation.ipynb

Lines changed: 214 additions & 0 deletions
@@ -0,0 +1,214 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import login\n",
    "import os\n",
    "import sys\n",
    "import csv\n",
    "from tqdm import trange\n",
    "from transformers import AutoModel, AutoTokenizer\n",
    "FILE_PATH = './QA_results_GT.csv'\n",
    "# Placeholder: set your own OpenAI API key (needed for the GPT-4o judge cell below).\n",
    "os.environ[\"OPENAI_API_KEY\"] = \"YOUR_OPENAI_API_KEY\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the questions, gold answers, and the three methods' answers\n",
    "# (columns 'naive', 'lightrag', 'minirag') from the combined results CSV.\n",
    "ANA_FILE_PATH = './mthp_output.csv'\n",
    "\n",
    "naiveanswer_LIST = []\n",
    "lightraganswer_LIST = []\n",
    "minianswer_LIST = []\n",
    "QUESTION_LIST = []\n",
    "GA_LIST = []\n",
    "filelength = 0\n",
    "with open(ANA_FILE_PATH, mode='r', encoding='utf-8') as question_file:\n",
    "    reader = csv.DictReader(question_file)\n",
    "    for row in reader:\n",
    "        QUESTION_LIST.append(row['Question'])\n",
    "        GA_LIST.append(row['Gold Answer'])\n",
    "        naiveanswer_LIST.append(row['naive'])\n",
    "        lightraganswer_LIST.append(row['lightrag'])\n",
    "        minianswer_LIST.append(row['minirag'])\n",
    "        filelength += 1"
   ]
  },
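  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An optional sanity-check sketch (it assumes the loading cell above has been run): confirm that the results CSV actually exposes the columns read there before spending judge API calls. `expected_cols` below is simply the set of columns referenced in that cell.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity-check sketch: verify the results CSV has the expected columns\n",
    "# and report how many rows were loaded.\n",
    "expected_cols = {'Question', 'Gold Answer', 'naive', 'lightrag', 'minirag'}\n",
    "with open(ANA_FILE_PATH, mode='r', encoding='utf-8') as f:\n",
    "    header = set(csv.DictReader(f).fieldnames or [])\n",
    "missing = expected_cols - header\n",
    "assert not missing, f\"Missing columns in {ANA_FILE_PATH}: {missing}\"\n",
    "print(f\"Loaded {filelength} rows from {ANA_FILE_PATH}\")"
   ]
  },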
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "PROMPT = \"\"\"\n",
    "Now, I'll give you a question, a gold answer to this question, and three answers provided by different students.\n",
    "\n",
    "Score each answer according to the following rules:\n",
    "If the answer is correct, it receives 1 point.\n",
    "If the answer is irrelevant to the question, it receives 0 points.\n",
    "If the answer is incorrect, it receives -1 point.\n",
    "\n",
    "Return your scores in JSON format.\n",
    "\n",
    "For example:\n",
    "\n",
    "Question:\n",
    "When does Li Hua arrive in the city?\n",
    "\n",
    "Gold Answer:\n",
    "20260105\n",
    "\n",
    "Answer1: LiHua arrived on the afternoon of January 5th\n",
    "Answer2: Sorry, there is no information about LiHua's arrival in the information you provided\n",
    "Answer3: There is no accurate answer in the information you provided, but according to the first information found, LiHua arrived on April 17th\n",
    "\n",
    "output:\n",
    "{{\n",
    "\"Score1\": 1,\n",
    "\"Score2\": 0,\n",
    "\"Score3\": -1\n",
    "}}\n",
    "\n",
    "Real data:\n",
    "\n",
    "Question:\n",
    "{question}\n",
    "Gold Answer:\n",
    "{ga}\n",
    "\n",
    "Answer1: {naive}\n",
    "Answer2: {light}\n",
    "Answer3: {mini}\n",
    "\n",
    "output:\n",
    "\n",
    "\"\"\""
   ]
  },
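  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal usage sketch (illustrative only; it assumes the loading cell above has been run): fill the template with the first QA pair and print it, to confirm that the `{question}`, `{ga}`, `{naive}`, `{light}` and `{mini}` placeholders all resolve before any judge API calls are made.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview sketch: format the judge prompt for the first QA pair and print it.\n",
    "# Assumes the CSV-loading cell above has been run (filelength > 0).\n",
    "if filelength > 0:\n",
    "    preview = PROMPT.format(\n",
    "        question=QUESTION_LIST[0],\n",
    "        ga=GA_LIST[0],\n",
    "        naive=naiveanswer_LIST[0],\n",
    "        light=lightraganswer_LIST[0],\n",
    "        mini=minianswer_LIST[0],\n",
    "    )\n",
    "    print(preview)"
   ]
  },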
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Judge option 1: DeepSeek (deepseek-chat).\n",
    "# Run either this cell or the GPT-4o cell below; both populate chat_list.\n",
    "from openai import OpenAI\n",
    "\n",
    "# Placeholder: replace with your own DeepSeek API key.\n",
    "chatbot = OpenAI(api_key=\"YOUR_DEEPSEEK_API_KEY\", base_url=\"https://api.deepseek.com\")\n",
    "\n",
    "chat_list = []\n",
    "for i in range(filelength):\n",
    "    p = PROMPT.format(question=QUESTION_LIST[i], ga=GA_LIST[i], naive=naiveanswer_LIST[i], light=lightraganswer_LIST[i], mini=minianswer_LIST[i])\n",
    "    chat_completion = chatbot.chat.completions.create(\n",
    "        messages=[\n",
    "            {\n",
    "                \"role\": \"system\",\n",
    "                \"content\": p,\n",
    "            },\n",
    "        ],\n",
    "        model=\"deepseek-chat\",\n",
    "        stream=False,\n",
    "    )\n",
    "    chat_list.append(chat_completion.choices[0].message.content.strip())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Judge option 2: GPT-4o via the OpenAI API (uses the OPENAI_API_KEY set above).\n",
    "from openai import OpenAI\n",
    "from tqdm import trange\n",
    "\n",
    "chatbot = OpenAI()\n",
    "chat_list = []\n",
    "for i in trange(filelength):\n",
    "    p = PROMPT.format(question=QUESTION_LIST[i], ga=GA_LIST[i], naive=naiveanswer_LIST[i], light=lightraganswer_LIST[i], mini=minianswer_LIST[i])\n",
    "    chat_completion = chatbot.chat.completions.create(\n",
    "        messages=[\n",
    "            {\n",
    "                \"role\": \"system\",\n",
    "                \"content\": p,\n",
    "            },\n",
    "        ],\n",
    "        model=\"gpt-4o\",\n",
    "    )\n",
    "    chat_list.append(chat_completion.choices[0].message.content.strip())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import json_repair\n",
    "\n",
    "# Parse each judge reply into a dict of scores; unparsable replies count as 0\n",
    "# (irrelevant) for all three answers so the score lists stay aligned.\n",
    "chat_score_list = []\n",
    "for chat in chat_list:\n",
    "    try:\n",
    "        # Drop any ```json fences around the reply, then repair and parse the JSON.\n",
    "        data = json_repair.loads(chat.strip('```json').strip('```'))\n",
    "        chat_score_list.append(data)\n",
    "    except Exception:\n",
    "        chat_score_list.append({'Score1': 0, 'Score2': 0, 'Score3': 0})\n",
    "        print('Error in chat:', chat)\n",
    "\n",
    "all_score1 = [data['Score1'] for data in chat_score_list]\n",
    "all_score2 = [data['Score2'] for data in chat_score_list]\n",
    "all_score3 = [data['Score3'] for data in chat_score_list]\n",
    "\n",
    "all_score1_1 = all_score1.count(1)\n",
    "all_score1_0 = all_score1.count(0)\n",
    "all_score1_neg = all_score1.count(-1)\n",
    "\n",
    "all_score2_1 = all_score2.count(1)\n",
    "all_score2_0 = all_score2.count(0)\n",
    "all_score2_neg = all_score2.count(-1)\n",
    "\n",
    "all_score3_1 = all_score3.count(1)\n",
    "all_score3_0 = all_score3.count(0)\n",
    "all_score3_neg = all_score3.count(-1)\n",
    "\n",
    "total = len(all_score1)  # total number of evaluated questions\n",
    "print(all_score1_1, all_score1_0, all_score1_neg)\n",
    "print(all_score2_1, all_score2_0, all_score2_neg)\n",
    "print(all_score3_1, all_score3_0, all_score3_neg)\n",
    "\n",
    "print(f\"Score1 1: {all_score1_1 / total * 100:.2f}%, Score1 0: {all_score1_0 / total * 100:.2f}%, Score1 -1: {all_score1_neg / total * 100:.2f}%\")\n",
    "print(f\"Score2 1: {all_score2_1 / total * 100:.2f}%, Score2 0: {all_score2_0 / total * 100:.2f}%, Score2 -1: {all_score2_neg / total * 100:.2f}%\")\n",
    "print(f\"Score3 1: {all_score3_1 / total * 100:.2f}%, Score3 0: {all_score3_0 / total * 100:.2f}%, Score3 -1: {all_score3_neg / total * 100:.2f}%\")"
   ]
  },
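  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A small follow-up sketch, under the assumption that `Score1`, `Score2` and `Score3` correspond to the `naive`, `lightrag` and `minirag` answers (the order used when formatting `PROMPT`): print the same score distribution keyed by method name instead of score index.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summary sketch: score distribution per method, assuming Answer1/Answer2/Answer3\n",
    "# were filled with the naive/lightrag/minirag answers respectively.\n",
    "from collections import Counter\n",
    "\n",
    "for label, scores in [('naive', all_score1), ('lightrag', all_score2), ('minirag', all_score3)]:\n",
    "    c = Counter(scores)\n",
    "    n = len(scores)\n",
    "    print(f\"{label:>8}:  +1 {c[1] / n * 100:6.2f}%   0 {c[0] / n * 100:6.2f}%   -1 {c[-1] / n * 100:6.2f}%\")"
   ]
  }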
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Tianyu_agent",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
