
Commit 5d520fc

add evaluation code&prompt
1 parent 908b1a4 commit 5d520fc

1 file changed: +214 -0

dataset/LiHua-World/evaluation.ipynb

Lines changed: 214 additions & 0 deletions
@@ -0,0 +1,214 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import login\n",
    "import os\n",
    "import sys\n",
    "import csv\n",
    "from tqdm import trange\n",
    "from transformers import AutoModel, AutoTokenizer\n",
    "FILE_PATH = './QA_results_GT.csv'\n",
    "# Placeholder: set your own OpenAI API key (needed for the GPT-4o judge cell below).\n",
    "os.environ[\"OPENAI_API_KEY\"] = \"YOUR_OPENAI_API_KEY\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the questions, gold answers, and the three methods' answers\n",
    "# (columns 'naive', 'lightrag', 'minirag') from the combined results CSV.\n",
    "ANA_FILE_PATH = './mthp_output.csv'\n",
    "\n",
    "naiveanswer_LIST = []\n",
    "lightraganswer_LIST = []\n",
    "minianswer_LIST = []\n",
    "QUESTION_LIST = []\n",
    "GA_LIST = []\n",
    "filelength = 0\n",
    "with open(ANA_FILE_PATH, mode='r', encoding='utf-8') as question_file:\n",
    "    reader = csv.DictReader(question_file)\n",
    "    for row in reader:\n",
    "        QUESTION_LIST.append(row['Question'])\n",
    "        GA_LIST.append(row['Gold Answer'])\n",
    "        naiveanswer_LIST.append(row['naive'])\n",
    "        lightraganswer_LIST.append(row['lightrag'])\n",
    "        minianswer_LIST.append(row['minirag'])\n",
    "        filelength += 1"
   ]
  },
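  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An optional sanity-check sketch (it assumes the loading cell above has been run): confirm that the results CSV actually exposes the columns read there before spending judge API calls. `expected_cols` below is simply the set of columns referenced in that cell.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity-check sketch: verify the results CSV has the expected columns\n",
    "# and report how many rows were loaded.\n",
    "expected_cols = {'Question', 'Gold Answer', 'naive', 'lightrag', 'minirag'}\n",
    "with open(ANA_FILE_PATH, mode='r', encoding='utf-8') as f:\n",
    "    header = set(csv.DictReader(f).fieldnames or [])\n",
    "missing = expected_cols - header\n",
    "assert not missing, f\"Missing columns in {ANA_FILE_PATH}: {missing}\"\n",
    "print(f\"Loaded {filelength} rows from {ANA_FILE_PATH}\")"
   ]
  },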
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "PROMPT = \"\"\"\n",
    "Now, I'll give you a question, a gold answer to this question, and three answers provided by different students.\n",
    "\n",
    "Score each answer according to the following rules:\n",
    "If the answer is correct, it receives 1 point.\n",
    "If the answer is irrelevant to the question, it receives 0 points.\n",
    "If the answer is incorrect, it receives -1 point.\n",
    "\n",
    "Return your scores in JSON format.\n",
    "\n",
    "For example:\n",
    "\n",
    "Question:\n",
    "When does Li Hua arrive in the city?\n",
    "\n",
    "Gold Answer:\n",
    "20260105\n",
    "\n",
    "Answer1: LiHua arrived on the afternoon of January 5th\n",
    "Answer2: Sorry, there is no information about LiHua's arrival in the information you provided\n",
    "Answer3: There is no accurate answer in the information you provided, but according to the first information found, LiHua arrived on April 17th\n",
    "\n",
    "output:\n",
    "{{\n",
    "\"Score1\": 1,\n",
    "\"Score2\": 0,\n",
    "\"Score3\": -1\n",
    "}}\n",
    "\n",
    "Real data:\n",
    "\n",
    "Question:\n",
    "{question}\n",
    "Gold Answer:\n",
    "{ga}\n",
    "\n",
    "Answer1: {naive}\n",
    "Answer2: {light}\n",
    "Answer3: {mini}\n",
    "\n",
    "output:\n",
    "\n",
    "\"\"\""
   ]
  },
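  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal usage sketch (illustrative only; it assumes the loading cell above has been run): fill the template with the first QA pair and print it, to confirm that the `{question}`, `{ga}`, `{naive}`, `{light}` and `{mini}` placeholders all resolve before any judge API calls are made.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview sketch: format the judge prompt for the first QA pair and print it.\n",
    "# Assumes the CSV-loading cell above has been run (filelength > 0).\n",
    "if filelength > 0:\n",
    "    preview = PROMPT.format(\n",
    "        question=QUESTION_LIST[0],\n",
    "        ga=GA_LIST[0],\n",
    "        naive=naiveanswer_LIST[0],\n",
    "        light=lightraganswer_LIST[0],\n",
    "        mini=minianswer_LIST[0],\n",
    "    )\n",
    "    print(preview)"
   ]
  },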
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Judge option 1: DeepSeek (deepseek-chat).\n",
    "# Run either this cell or the GPT-4o cell below; both populate chat_list.\n",
    "from openai import OpenAI\n",
    "\n",
    "# Placeholder: replace with your own DeepSeek API key.\n",
    "chatbot = OpenAI(api_key=\"YOUR_DEEPSEEK_API_KEY\", base_url=\"https://api.deepseek.com\")\n",
    "\n",
    "chat_list = []\n",
    "for i in range(filelength):\n",
    "    p = PROMPT.format(question=QUESTION_LIST[i], ga=GA_LIST[i], naive=naiveanswer_LIST[i], light=lightraganswer_LIST[i], mini=minianswer_LIST[i])\n",
    "    chat_completion = chatbot.chat.completions.create(\n",
    "        messages=[\n",
    "            {\n",
    "                \"role\": \"system\",\n",
    "                \"content\": p,\n",
    "            },\n",
    "        ],\n",
    "        model=\"deepseek-chat\",\n",
    "        stream=False,\n",
    "    )\n",
    "    chat_list.append(chat_completion.choices[0].message.content.strip())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Judge option 2: GPT-4o via the OpenAI API (uses the OPENAI_API_KEY set above).\n",
    "from openai import OpenAI\n",
    "from tqdm import trange\n",
    "\n",
    "chatbot = OpenAI()\n",
    "chat_list = []\n",
    "for i in trange(filelength):\n",
    "    p = PROMPT.format(question=QUESTION_LIST[i], ga=GA_LIST[i], naive=naiveanswer_LIST[i], light=lightraganswer_LIST[i], mini=minianswer_LIST[i])\n",
    "    chat_completion = chatbot.chat.completions.create(\n",
    "        messages=[\n",
    "            {\n",
    "                \"role\": \"system\",\n",
    "                \"content\": p,\n",
    "            },\n",
    "        ],\n",
    "        model=\"gpt-4o\",\n",
    "    )\n",
    "    chat_list.append(chat_completion.choices[0].message.content.strip())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import json_repair\n",
    "\n",
    "# Parse each judge reply into a dict of scores; unparsable replies count as 0\n",
    "# (irrelevant) for all three answers so the score lists stay aligned.\n",
    "chat_score_list = []\n",
    "for chat in chat_list:\n",
    "    try:\n",
    "        # Drop any ```json fences around the reply, then repair and parse the JSON.\n",
    "        data = json_repair.loads(chat.strip('```json').strip('```'))\n",
    "        chat_score_list.append(data)\n",
    "    except Exception:\n",
    "        chat_score_list.append({'Score1': 0, 'Score2': 0, 'Score3': 0})\n",
    "        print('Error in chat:', chat)\n",
    "\n",
    "all_score1 = [data['Score1'] for data in chat_score_list]\n",
    "all_score2 = [data['Score2'] for data in chat_score_list]\n",
    "all_score3 = [data['Score3'] for data in chat_score_list]\n",
    "\n",
    "all_score1_1 = all_score1.count(1)\n",
    "all_score1_0 = all_score1.count(0)\n",
    "all_score1_neg = all_score1.count(-1)\n",
    "\n",
    "all_score2_1 = all_score2.count(1)\n",
    "all_score2_0 = all_score2.count(0)\n",
    "all_score2_neg = all_score2.count(-1)\n",
    "\n",
    "all_score3_1 = all_score3.count(1)\n",
    "all_score3_0 = all_score3.count(0)\n",
    "all_score3_neg = all_score3.count(-1)\n",
    "\n",
    "total = len(all_score1)  # total number of evaluated questions\n",
    "print(all_score1_1, all_score1_0, all_score1_neg)\n",
    "print(all_score2_1, all_score2_0, all_score2_neg)\n",
    "print(all_score3_1, all_score3_0, all_score3_neg)\n",
    "\n",
    "print(f\"Score1 1: {all_score1_1 / total * 100:.2f}%, Score1 0: {all_score1_0 / total * 100:.2f}%, Score1 -1: {all_score1_neg / total * 100:.2f}%\")\n",
    "print(f\"Score2 1: {all_score2_1 / total * 100:.2f}%, Score2 0: {all_score2_0 / total * 100:.2f}%, Score2 -1: {all_score2_neg / total * 100:.2f}%\")\n",
    "print(f\"Score3 1: {all_score3_1 / total * 100:.2f}%, Score3 0: {all_score3_0 / total * 100:.2f}%, Score3 -1: {all_score3_neg / total * 100:.2f}%\")"
   ]
  },
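  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A small follow-up sketch, under the assumption that `Score1`, `Score2` and `Score3` correspond to the `naive`, `lightrag` and `minirag` answers (the order used when formatting `PROMPT`): print the same score distribution keyed by method name instead of score index.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summary sketch: score distribution per method, assuming Answer1/Answer2/Answer3\n",
    "# were filled with the naive/lightrag/minirag answers respectively.\n",
    "from collections import Counter\n",
    "\n",
    "for label, scores in [('naive', all_score1), ('lightrag', all_score2), ('minirag', all_score3)]:\n",
    "    c = Counter(scores)\n",
    "    n = len(scores)\n",
    "    print(f\"{label:>8}:  +1 {c[1] / n * 100:6.2f}%   0 {c[0] / n * 100:6.2f}%   -1 {c[-1] / n * 100:6.2f}%\")"
   ]
  }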
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Tianyu_agent",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
