._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n"]}], "source": ["graded_outputs = eval_chain.evaluate(examples, predictions)#\u5728\u6b64\u94fe\u4e0a\u8c03\u7528evaluate\uff0c\u8fdb\u884c\u8bc4\u4f30"]}, {"cell_type": "markdown", "id": "72dc8595", "metadata": {}, "source": ["#### \u8bc4\u4f30\u601d\u8def\n", "\u5f53\u5b83\u9762\u524d\u6709\u6574\u4e2a\u6587\u6863\u65f6\uff0c\u5b83\u53ef\u4ee5\u751f\u6210\u4e00\u4e2a\u771f\u5b9e\u7684\u7b54\u6848\uff0c\u6211\u4eec\u5c06\u6253\u5370\u51fa\u9884\u6d4b\u7684\u7b54\u6848\uff0c\u5f53\u5b83\u8fdb\u884cQA\u94fe\u65f6\uff0c\u4f7f\u7528embedding\u548c\u5411\u91cf\u6570\u636e\u5e93\u8fdb\u884c\u68c0\u7d22\u65f6\uff0c\u5c06\u5176\u4f20\u9012\u5230\u8bed\u8a00\u6a21\u578b\u4e2d\uff0c\u7136\u540e\u5c1d\u8bd5\u731c\u6d4b\u9884\u6d4b\u7684\u7b54\u6848\uff0c\u6211\u4eec\u8fd8\u5c06\u6253\u5370\u51fa\u6210\u7ee9\uff0c\u8fd9\u4e5f\u662f\u8bed\u8a00\u6a21\u578b\u751f\u6210\u7684\u3002\u5f53\u5b83\u8981\u6c42\u8bc4\u4f30\u94fe\u8bc4\u4f30\u6b63\u5728\u53d1\u751f\u7684\u4e8b\u60c5\u65f6\uff0c\u4ee5\u53ca\u5b83\u662f\u5426\u6b63\u786e\u6216\u4e0d\u6b63\u786e\u3002\u56e0\u6b64\uff0c\u5f53\u6211\u4eec\u5faa\u73af\u904d\u5386\u6240\u6709\u8fd9\u4e9b\u793a\u4f8b\u5e76\u5c06\u5b83\u4eec\u6253\u5370\u51fa\u6765\u65f6\uff0c\u53ef\u4ee5\u8be6\u7ec6\u4e86\u89e3\u6bcf\u4e2a\u793a\u4f8b\u3002"]}, {"cell_type": "code", "execution_count": 54, "id": "bf21e40a", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["Example 0:\n", "Question: \u9ad8\u6e05\u7535\u89c6\u673a\u600e\u4e48\u8fdb\u884c\u62a4\u7406\uff1f\n", "Real Answer: 
\u4f7f\u7528\u5e72\u5e03\u6e05\u6d01\u3002\n", "Predicted Answer: \u9ad8\u6e05\u7535\u89c6\u673a\u7684\u62a4\u7406\u975e\u5e38\u7b80\u5355\u3002\u60a8\u53ea\u9700\u8981\u4f7f\u7528\u5e72\u5e03\u6e05\u6d01\u5373\u53ef\u3002\u907f\u514d\u4f7f\u7528\u6e7f\u5e03\u6216\u5316\u5b66\u6e05\u6d01\u5242\uff0c\u4ee5\u514d\u635f\u574f\u7535\u89c6\u673a\u7684\u8868\u9762\u3002\n", "Predicted Grade: CORRECT\n", "\n", "Example 1:\n", "Question: \u65c5\u884c\u80cc\u5305\u6709\u5185\u5916\u888b\u5417\uff1f\n", "Real Answer: \u6709\u3002\n", "Predicted Answer: \u662f\u7684\uff0c\u65c5\u884c\u80cc\u5305\u6709\u591a\u4e2a\u5b9e\u7528\u7684\u5185\u5916\u888b\uff0c\u53ef\u4ee5\u8f7b\u677e\u88c5\u4e0b\u60a8\u7684\u5fc5\u9700\u54c1\u3002\n", "Predicted Grade: CORRECT\n", "\n", "Example 2:\n", "Question: \u8fd9\u6b3e\u5168\u81ea\u52a8\u5496\u5561\u673a\u6709\u4ec0\u4e48\u7279\u70b9\u548c\u4f18\u52bf\uff1f\n", "Real Answer: \u8fd9\u6b3e\u5168\u81ea\u52a8\u5496\u5561\u673a\u7684\u7279\u70b9\u548c\u4f18\u52bf\u5305\u62ec\u4e00\u952e\u64cd\u4f5c\u3001\u5185\u7f6e\u7814\u78e8\u5668\u548c\u6ee4\u7f51\u3001\u9884\u8bbe\u591a\u79cd\u5496\u5561\u6a21\u5f0f\u3001\u8010\u7528\u6027\u548c\u4e00\u81f4\u6027\uff0c\u4ee5\u53ca\u7531\u9ad8\u54c1\u8d28\u4e0d\u9508\u94a2\u5236\u6210\u7684\u6784\u9020\u3002\n", "Predicted Answer: \u8fd9\u6b3e\u5168\u81ea\u52a8\u5496\u5561\u673a\u6709\u4ee5\u4e0b\u7279\u70b9\u548c\u4f18\u52bf\uff1a\n", "1. \u4e00\u952e\u64cd\u4f5c\uff1a\u53ea\u9700\u6309\u4e0b\u6309\u94ae\uff0c\u5373\u53ef\u7814\u78e8\u5496\u5561\u8c46\u5e76\u6c8f\u5236\u51fa\u60a8\u559c\u7231\u7684\u5496\u5561\uff0c\u975e\u5e38\u65b9\u4fbf\u3002\n", "2. \u8010\u7528\u6027\u548c\u4e00\u81f4\u6027\uff1a\u8fd9\u6b3e\u5496\u5561\u673a\u5177\u6709\u8010\u7528\u6027\u548c\u4e00\u81f4\u6027\uff0c\u4f7f\u5176\u6210\u4e3a\u5bb6\u5ead\u548c\u529e\u516c\u5ba4\u7684\u7406\u60f3\u9009\u62e9\u3002\n", "3. 
\u5185\u7f6e\u7814\u78e8\u5668\u548c\u6ee4\u7f51\uff1a\u5496\u5561\u673a\u5185\u7f6e\u7814\u78e8\u5668\u548c\u6ee4\u7f51\uff0c\u53ef\u4ee5\u786e\u4fdd\u5496\u5561\u7684\u65b0\u9c9c\u548c\u53e3\u611f\u3002\n", "4. \u591a\u79cd\u5496\u5561\u6a21\u5f0f\uff1a\u5496\u5561\u673a\u9884\u8bbe\u4e86\u591a\u79cd\u5496\u5561\u6a21\u5f0f\uff0c\u53ef\u4ee5\u6839\u636e\u4e2a\u4eba\u53e3\u5473\u9009\u62e9\u4e0d\u540c\u7684\u5496\u5561\u3002\n", "5. \u9ad8\u54c1\u8d28\u6750\u6599\uff1a\u5496\u5561\u673a\u7531\u9ad8\u54c1\u8d28\u4e0d\u9508\u94a2\u5236\u6210\uff0c\u5177\u6709\u4f18\u826f\u7684\u8010\u7528\u6027\u548c\u8d28\u611f\u3002\n", "6. \u4e2d\u56fd\u5236\u9020\uff1a\u8fd9\u6b3e\u5496\u5561\u673a\u662f\u5728\u4e2d\u56fd\u5236\u9020\u7684\uff0c\u5177\u6709\u53ef\u9760\u7684\u54c1\u8d28\u4fdd\u8bc1\u3002\n", "Predicted Grade: CORRECT\n", "\n", "Example 3:\n", "Question: \u8fd9\u6b3e\u7535\u52a8\u7259\u5237\u7684\u89c4\u683c\u662f\u4ec0\u4e48\uff1f\n", "Real Answer: \u8fd9\u6b3e\u7535\u52a8\u7259\u5237\u7684\u89c4\u683c\u662f\u4e00\u822c\u5927\u5c0f\uff0c\u9ad8\u5ea6\u4e3a9.5\u82f1\u5bf8\uff0c\u5bbd\u5ea6\u4e3a1\u82f1\u5bf8\u3002\n", "Predicted Answer: \u8fd9\u6b3e\u7535\u52a8\u7259\u5237\u7684\u89c4\u683c\u662f\uff1a\u9ad8\u5ea6\u4e3a9.5\u82f1\u5bf8\uff0c\u5bbd\u5ea6\u4e3a1\u82f1\u5bf8\u3002\n", "Predicted Grade: CORRECT\n", "\n", "Example 4:\n", "Question: \u8fd9\u6b3e\u6a59\u5473\u7ef4\u751f\u7d20C\u6ce1\u817e\u7247\u7684\u89c4\u683c\u662f\u4ec0\u4e48\uff1f\n", "Real Answer: \u8fd9\u6b3e\u6a59\u5473\u7ef4\u751f\u7d20C\u6ce1\u817e\u7247\u6bcf\u76d2\u542b\u670920\u7247\u3002\n", "Predicted Answer: \u8fd9\u6b3e\u6a59\u5473\u7ef4\u751f\u7d20C\u6ce1\u817e\u7247\u7684\u89c4\u683c\u662f\u6bcf\u76d2\u542b\u670920\u7247\u3002\n", "Predicted Grade: CORRECT\n", "\n", "Example 5:\n", "Question: \u8fd9\u6b3e\u65e0\u7ebf\u84dd\u7259\u8033\u673a\u7684\u89c4\u683c\u662f\u4ec0\u4e48\uff1f\n", "Real Answer: \u5355\u4e2a\u8033\u673a\u5c3a\u5bf8\u4e3a1.5'' x 1.3''\u3002\n", "Predicted Answer: 
\u8fd9\u6b3e\u65e0\u7ebf\u84dd\u7259\u8033\u673a\u7684\u89c4\u683c\u662f\u5355\u4e2a\u8033\u673a\u5c3a\u5bf8\u4e3a1.5'' x 1.3''\u3002\n", "Predicted Grade: CORRECT\n", "\n", "Example 6:\n", "Question: \u8fd9\u4e2a\u4ea7\u54c1\u7684\u540d\u79f0\u662f\u4ec0\u4e48\uff1f\n", "Real Answer: \u745c\u4f3d\u57ab\n", "Predicted Answer: \u8fd9\u4e2a\u4ea7\u54c1\u7684\u540d\u79f0\u662f\u513f\u7ae5\u76ca\u667a\u73a9\u5177\u3002\n", "Predicted Grade: INCORRECT\n", "\n", "Example 7:\n", "Question: \u8fd9\u6b3e\u5168\u81ea\u52a8\u5496\u5561\u673a\u7684\u89c4\u683c\u662f\u4ec0\u4e48\uff1f\n", "Real Answer: \u5927\u578b\u5c3a\u5bf8\u4e3a13.8'' x 17.3''\uff0c\u4e2d\u578b\u5c3a\u5bf8\u4e3a11.5'' x 15.2''\u3002\n", "Predicted Answer: \u8fd9\u6b3e\u5168\u81ea\u52a8\u5496\u5561\u673a\u6709\u4e24\u79cd\u89c4\u683c\uff1a\n", "- \u5927\u578b\u5c3a\u5bf8\u4e3a13.8'' x 17.3''\u3002\n", "- \u4e2d\u578b\u5c3a\u5bf8\u4e3a11.5'' x 15.2''\u3002\n", "Predicted Grade: CORRECT\n", "\n", "Example 8:\n", "Question: \u8fd9\u6b3e\u7535\u52a8\u7259\u5237\u7684\u9ad8\u5ea6\u548c\u5bbd\u5ea6\u5206\u522b\u662f\u591a\u5c11\uff1f\n", "Real Answer: \u8fd9\u6b3e\u7535\u52a8\u7259\u5237\u7684\u9ad8\u5ea6\u662f9.5\u82f1\u5bf8\uff0c\u5bbd\u5ea6\u662f1\u82f1\u5bf8\u3002\n", "Predicted Answer: \u8fd9\u6b3e\u7535\u52a8\u7259\u5237\u7684\u9ad8\u5ea6\u662f9.5\u82f1\u5bf8\uff0c\u5bbd\u5ea6\u662f1\u82f1\u5bf8\u3002\n", "Predicted Grade: CORRECT\n", "\n", "Example 9:\n", "Question: \u8fd9\u6b3e\u6a59\u5473\u7ef4\u751f\u7d20C\u6ce1\u817e\u7247\u7684\u89c4\u683c\u662f\u4ec0\u4e48\uff1f\n", "Real Answer: \u6bcf\u76d2\u542b\u670920\u7247\u3002\n", "Predicted Answer: \u8fd9\u6b3e\u6a59\u5473\u7ef4\u751f\u7d20C\u6ce1\u817e\u7247\u7684\u89c4\u683c\u662f\u6bcf\u76d2\u542b\u670920\u7247\u3002\n", "Predicted Grade: CORRECT\n", "\n", "Example 10:\n", "Question: \u8fd9\u6b3e\u65e0\u7ebf\u84dd\u7259\u8033\u673a\u7684\u5c3a\u5bf8\u662f\u591a\u5c11\uff1f\n", "Real Answer: 
\u8fd9\u6b3e\u65e0\u7ebf\u84dd\u7259\u8033\u673a\u7684\u5c3a\u5bf8\u662f1.5'' x 1.3''\u3002\n", "Predicted Answer: \u8fd9\u6b3e\u65e0\u7ebf\u84dd\u7259\u8033\u673a\u7684\u5c3a\u5bf8\u662f1.5'' x 1.3''\u3002\n", "Predicted Grade: CORRECT\n", "\n", "Example 11:\n", "Question: \u8fd9\u6b3e\u745c\u4f3d\u57ab\u7684\u5c3a\u5bf8\u662f\u591a\u5c11\uff1f\n", "Real Answer: \u8fd9\u6b3e\u745c\u4f3d\u57ab\u7684\u5c3a\u5bf8\u662f24'' x 68''\u3002\n", "Predicted Answer: \u8fd9\u6b3e\u745c\u4f3d\u57ab\u7684\u5c3a\u5bf8\u662f24'' x 68''\u3002\n", "Predicted Grade: CORRECT\n", "\n"]}], "source": ["#\u6211\u4eec\u5c06\u4f20\u5165\u793a\u4f8b\u548c\u9884\u6d4b\uff0c\u5f97\u5230\u4e00\u5806\u5206\u7ea7\u8f93\u51fa\uff0c\u5faa\u73af\u904d\u5386\u5b83\u4eec\u6253\u5370\u7b54\u6848\n", "for i, eg in enumerate(examples):\n", " print(f\"Example {i}:\")\n", " print(\"Question: \" + predictions[i]['query'])\n", " print(\"Real Answer: \" + predictions[i]['answer'])\n", " print(\"Predicted Answer: \" + predictions[i]['result'])\n", " print(\"Predicted Grade: \" + graded_outputs[i]['text'])\n", " print()"]}, {"cell_type": "markdown", "id": "ece7c7b4", "metadata": {}, "source": ["#### \u7ed3\u679c\u5206\u6790\n", "\u5bf9\u4e8e\u5927\u591a\u6570\u793a\u4f8b\uff0c\u5b83\u770b\u8d77\u6765\u90fd\u662f\u6b63\u786e\u7684\uff08\u53ea\u6709 Example 6 \u88ab\u8bc4\u4e3a INCORRECT\uff09\uff0c\u8ba9\u6211\u4eec\u770b\u770b Example 1\u3002\n", "\u8fd9\u91cc\u7684\u95ee\u9898\u662f\uff0c\u65c5\u884c\u80cc\u5305\u6709\u5185\u5916\u888b\u5417\uff1f\u771f\u6b63\u7684\u7b54\u6848\uff0c\u6211\u4eec\u521b\u5efa\u4e86\u8fd9\u4e2a\uff0c\u662f\u80af\u5b9a\u7684\u3002\u6a21\u578b\u9884\u6d4b\u7684\u7b54\u6848\u662f\u662f\u7684\uff0c\u65c5\u884c\u80cc\u5305\u6709\u591a\u4e2a\u5b9e\u7528\u7684\u5185\u5916\u888b\uff0c\u53ef\u4ee5\u8f7b\u677e\u88c5\u4e0b\u60a8\u7684\u5fc5\u9700\u54c1\u3002\u56e0\u6b64\uff0c\u6211\u4eec\u53ef\u4ee5\u7406\u89e3\u8fd9\u662f\u4e00\u4e2a\u6b63\u786e\u7684\u7b54\u6848\u3002\u5b83\u5c06\u5176\u8bc4\u4e3a\u6b63\u786e\u3002 \n", "#### 
\u4f7f\u7528\u6a21\u578b\u8bc4\u4f30\u7684\u4f18\u52bf\n", "\n", "\u4f60\u6709\u8fd9\u4e9b\u7b54\u6848\uff0c\u5b83\u4eec\u662f\u4efb\u610f\u7684\u5b57\u7b26\u4e32\u3002\u6ca1\u6709\u5355\u4e00\u7684\u771f\u5b9e\u5b57\u7b26\u4e32\u662f\u6700\u597d\u7684\u53ef\u80fd\u7b54\u6848\uff0c\u6709\u8bb8\u591a\u4e0d\u540c\u7684\u53d8\u4f53\uff0c\u53ea\u8981\u5b83\u4eec\u5177\u6709\u76f8\u540c\u7684\u8bed\u4e49\uff0c\u5b83\u4eec\u5e94\u8be5\u88ab\u8bc4\u4e3a\u76f8\u4f3c\u3002\u5982\u679c\u4f7f\u7528\u6b63\u5219\u8fdb\u884c\u7cbe\u51c6\u5339\u914d\u5c31\u4f1a\u4e22\u5931\u8bed\u4e49\u4fe1\u606f\uff0c\u5230\u76ee\u524d\u4e3a\u6b62\u5b58\u5728\u7684\u8bb8\u591a\u8bc4\u4f30\u6307\u6807\u90fd\u4e0d\u591f\u597d\u3002\u76ee\u524d\u6700\u6709\u8da3\u548c\u6700\u53d7\u6b22\u8fce\u7684\u4e4b\u4e00\u5c31\u662f\u4f7f\u7528\u8bed\u8a00\u6a21\u578b\u8fdb\u884c\u8bc4\u4f30\u3002"]}], "metadata": {"kernelspec": {"display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12"}, "toc": {"base_numbering": 1, "nav_menu": {}, "number_sections": false, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {"height": "calc(100% - 180px)", "left": "10px", "top": "150px", "width": "261.818px"}, "toc_section_display": true, "toc_window_display": true}}, "nbformat": 4, "nbformat_minor": 5}
\ No newline at end of file
diff --git "a/content/\345\277\205\344\277\256\344\270\211-LangChain for LLM Application Development/7.\344\273\243\347\220\206 Agent.ipynb" "b/content/\345\277\205\344\277\256\344\270\211-LangChain for LLM Application Development/7.\344\273\243\347\220\206 Agent.ipynb"
index 476235dc..8b107496 100644
--- "a/content/\345\277\205\344\277\256\344\270\211-LangChain for LLM Application Development/7.\344\273\243\347\220\206 Agent.ipynb"
+++ "b/content/\345\277\205\344\277\256\344\270\211-LangChain for LLM Application Development/7.\344\273\243\347\220\206 Agent.ipynb"
@@ -1 +1 @@
-{"cells":[{"cell_type":"markdown","id":"2caa79ba-45e3-437c-9cb6-e433f443f0bf","metadata":{},"source":["# 第七章 代理\n","\n"," - [一、设置OpenAI API Key](#一、设置OpenAI-API-Key)\n"," - [二、LangChain内置工具](#二、LangChain内置工具)\n"," - [2.1 使用llm-math和wikipedia工具](#2.1-使用llm-math和wikipedia工具)\n"," - [2.2 使用PythonREPLTool工具](#2.2-使用PythonREPLTool工具)\n"," - [三、 定义自己的工具并在代理中使用](#三、-定义自己的工具并在代理中使用)\n"," - [3.1 创建和使用自定义时间工具](#3.1-创建和使用自定义时间工具)\n"]},{"cell_type":"markdown","id":"8a31726a-82ac-46e4-a51e-7747fc4c6aed","metadata":{"tags":[]},"source":["## 一、设置OpenAI API Key\n","\n","登陆 [OpenAI 账户](https://platform.openai.com/account/api-keys) 获取API Key,然后将其设置为环境变量。\n","\n","- 如果你想要设置为全局环境变量,可以参考[知乎文章](https://zhuanlan.zhihu.com/p/627665725)。\n","- 如果你想要设置为本地/项目环境变量,在本文件目录下创建`.env`文件, 打开文件输入以下内容。\n","\n"," \n"," OPENAI_API_KEY=\"your_api_key\" \n","
\n"," \n"," 替换\"your_api_key\"为你自己的 API Key"]},{"cell_type":"code","execution_count":1,"id":"312f4c23-fe27-4cda-a645-8e1519c2febf","metadata":{"tags":[]},"outputs":[],"source":["import os\n","import openai\n","from dotenv import load_dotenv, find_dotenv\n","\n","# 读取本地/项目的环境变量。\n","\n","# find_dotenv()寻找并定位.env文件的路径\n","# load_dotenv()读取该.env文件,并将其中的环境变量加载到当前的运行环境中 \n","# 如果你设置的是全局的环境变量,这行代码则没有任何作用。\n","_ = load_dotenv(find_dotenv())\n","\n","# 获取环境变量 OPENAI_API_KEY\n","openai.api_key = os.environ['OPENAI_API_KEY'] "]},{"cell_type":"markdown","id":"631c764b-68fa-483a-80a5-9a322cd1117c","metadata":{},"source":["## 二、LangChain内置工具"]},{"cell_type":"code","execution_count":2,"id":"bc80a8a7-a66e-4fbb-8d57-05004a3e77c7","metadata":{},"outputs":[],"source":["from langchain.agents import load_tools, initialize_agent\n","from langchain.agents import AgentType\n","from langchain.python import PythonREPL\n","from langchain.chat_models.openai import ChatOpenAI"]},{"cell_type":"markdown","id":"d5a3655c-4d5e-4a86-8bf4-a11bd1525059","metadata":{},"source":["### 2.1 使用llm-math和wikipedia工具"]},{"cell_type":"markdown","id":"e484d285-2110-4a06-a360-55313d6d9ffc","metadata":{},"source":["#### 1️⃣ 初始化大语言模型\n","\n","- 默认密钥`openai_api_key`为环境变量`OPENAI_API_KEY`。因此在运行以下代码之前,确保你已经设置环境变量`OPENAI_API_KEY`。如果还没有密钥,请[获取你的API Key](https://platform.openai.com/account/api-keys) 。\n","- 默认模型`model_name`为`gpt-3.5-turbo`。\n","- 更多关于模型默认参数请查看[这里](https://github.com/hwchase17/langchain/blob/master/langchain/chat_models/openai.py)。"]},{"cell_type":"code","execution_count":3,"id":"71e5894f-fc50-478b-a3d3-58891cc46ffc","metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["/Users/lta/anaconda3/envs/cookbook/lib/python3.10/site-packages/langchain_core/_api/deprecation.py:119: LangChainDeprecationWarning: The class `ChatOpenAI` was deprecated in LangChain 0.0.10 and will be removed in 0.3.0. An updated version of the class exists in the langchain-openai package and should be used instead. 
To use it run `pip install -U langchain-openai` and import as `from langchain_openai import ChatOpenAI`.\n"," warn_deprecated(\n"]}],"source":["# 参数temperature设置为0.0,从而减少生成答案的随机性。\n","llm = ChatOpenAI(temperature=0)"]},{"cell_type":"markdown","id":"d7f75023-1825-4d74-bc8e-2c362f551fd1","metadata":{"tags":[]},"source":["#### 2️⃣ 加载工具包\n","- `llm-math` 工具结合语言模型和计算器用以进行数学计算\n","- `wikipedia`工具通过API连接到wikipedia进行搜索查询。"]},{"cell_type":"code","execution_count":4,"id":"3541b866-8a35-4b6f-bfe6-66edb1d666cd","metadata":{},"outputs":[],"source":["tools = load_tools(\n"," [\"llm-math\",\"wikipedia\"], \n"," llm=llm #第一步初始化的模型\n",")"]},{"cell_type":"markdown","id":"e5b4fcc8-8817-4a94-b154-3f328480e441","metadata":{},"source":["#### 3️⃣ 初始化代理\n","\n","- `agent`: 代理类型。这里使用的是`AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION`。其中`CHAT`代表代理模型为针对对话优化的模型,`REACT`代表针对REACT设计的提示模版。\n","- `handle_parsing_errors`: 是否处理解析错误。当发生解析错误时,将错误信息返回给大模型,让其进行纠正。\n","- `verbose`: 是否输出中间步骤结果。"]},{"cell_type":"code","execution_count":5,"id":"e7cdc782-2b8c-4649-ae1e-cabacee1a757","metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["/Users/lta/anaconda3/envs/cookbook/lib/python3.10/site-packages/langchain_core/_api/deprecation.py:119: LangChainDeprecationWarning: The function `initialize_agent` was deprecated in LangChain 0.1.0 and will be removed in 0.3.0. Use Use new agent constructor methods like create_react_agent, create_json_agent, create_structured_chat_agent, etc. 
instead.\n"," warn_deprecated(\n"]}],"source":["agent= initialize_agent(\n"," tools, #第二步加载的工具\n"," llm, #第一步初始化的模型\n"," agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION, #代理类型\n"," handle_parsing_errors=True, #处理解析错误\n"," verbose = True #输出中间步骤\n",")"]},{"cell_type":"markdown","id":"b980137a-a1d2-4c19-80c8-ec380f9c1efe","metadata":{"tags":[]},"source":["#### 4️⃣.1️⃣ 使用代理回答数学问题"]},{"cell_type":"code","execution_count":6,"id":"1e2bcd3b-ae72-4fa1-b38e-d0ce650e1f31","metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["/Users/lta/anaconda3/envs/cookbook/lib/python3.10/site-packages/langchain_core/_api/deprecation.py:119: LangChainDeprecationWarning: The method `Chain.__call__` was deprecated in langchain 0.1.0 and will be removed in 0.3.0. Use invoke instead.\n"," warn_deprecated(\n"]},{"name":"stdout","output_type":"stream","text":["\n","\n","\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n","\u001b[32;1m\u001b[1;3mQuestion: 计算300的25%,思考过程请使用中文。\n","Thought: 首先,我们知道25%可以表示为0.25,然后将300乘以0.25即可得到答案。\n","Action:\n","```\n","{\n"," \"action\": \"Calculator\",\n"," \"action_input\": \"300*0.25\"\n","}\n","```\n","\u001b[0m\n","Observation: \u001b[36;1m\u001b[1;3mAnswer: 75.0\u001b[0m\n","Thought:\u001b[32;1m\u001b[1;3mFinal Answer: 75.0\u001b[0m\n","\n","\u001b[1m> Finished chain.\u001b[0m\n"]},{"data":{"text/plain":["{'input': '计算300的25%,思考过程请使用中文。', 'output': '75.0'}"]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["agent(\"计算300的25%,思考过程请使用中文。\") "]},{"cell_type":"markdown","id":"05bb0811-d71e-4016-868b-efbb651d8e59","metadata":{},"source":["✅ **总结**\n","\n","1. 模型对于接下来需要做什么,给出思考(Thought) \n"," \n"," 思考:我们需要计算300的25%,这个过程中需要用到乘法和除法。
\n","\n","2. 模型基于思考采取行动(Action)\n"," 行动: 使用计算器(calculator),输入300*0.25
\n","3. 模型得到观察(Observation)\n"," 观察:答案: 75.0
\n","\n","4. 基于观察,模型对于接下来需要做什么,给出思考(Thought)\n"," 思考: 我们的问题有了答案
\n","\n","5. 给出最终答案(Final Answer)\n"," 最终答案: 75.0
\n","5. 以字典的形式给出最终答案。"]},{"cell_type":"markdown","id":"bdd13d06-0f0e-4918-ac28-339f524f8c76","metadata":{},"source":["#### 4️⃣.2️⃣ Tom M. Mitchell的书"]},{"cell_type":"code","execution_count":7,"id":"f43b5cbf-1011-491b-90e8-09cf903fb866","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["\n","\n","\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n","\u001b[32;1m\u001b[1;3mThought: I can use Wikipedia to find out which book Tom M. Mitchell wrote.\n","Action:\n","```\n","{\n"," \"action\": \"wikipedia\",\n"," \"action_input\": \"Tom M. Mitchell\"\n","}\n","```\u001b[0m"]},{"name":"stderr","output_type":"stream","text":["/Users/lta/anaconda3/envs/cookbook/lib/python3.10/site-packages/wikipedia/wikipedia.py:389: GuessedAtParserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"html.parser\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n","\n","The code that caused this warning is on line 389 of the file /Users/lta/anaconda3/envs/cookbook/lib/python3.10/site-packages/wikipedia/wikipedia.py. To get rid of this warning, pass the additional argument 'features=\"html.parser\"' to the BeautifulSoup constructor.\n","\n"," lis = BeautifulSoup(html).find_all('li')\n"]},{"name":"stdout","output_type":"stream","text":["\n","Observation: \u001b[33;1m\u001b[1;3mPage: Tom M. Mitchell\n","Summary: Tom Michael Mitchell (born August 9, 1951) is an American computer scientist and the Founders University Professor at Carnegie Mellon University (CMU). He is a founder and former Chair of the Machine Learning Department at CMU. Mitchell is known for his contributions to the advancement of machine learning, artificial intelligence, and cognitive neuroscience and is the author of the textbook Machine Learning. 
He is a member of the United States National Academy of Engineering since 2010. He is also a Fellow of the American Academy of Arts and Sciences, the American Association for the Advancement of Science and a Fellow and past President of the Association for the Advancement of Artificial Intelligence. In October 2018, Mitchell was appointed as the Interim Dean of the School of Computer Science at Carnegie Mellon.\n","\n","Page: Tom Mitchell (Australian footballer)\n","Summary: Thomas Mitchell (born 31 May 1993) is a professional Australian rules footballer playing for the Collingwood Football Club in the Australian Football League (AFL). He previously played for the Sydney Swans from 2012 to 2016, and the Hawthorn Football Club between 2017 and 2022. Mitchell won the Brownlow Medal as the league's best and fairest player in 2018 and set the record for the most disposals in a VFL/AFL match, accruing 54 in a game against Collingwood during that season. He would later join them in 2023, en route to winning the 2023 AFL Grand Final and his first AFL premiership.\u001b[0m\n","Thought:\u001b[32;1m\u001b[1;3mI have found that Tom M. Mitchell wrote the textbook \"Machine Learning.\"\n","Final Answer: Machine Learning\u001b[0m\n","\n","\u001b[1m> Finished chain.\u001b[0m\n"]},{"data":{"text/plain":["{'input': 'Tom M. Mitchell is an American computer scientist and the Founders University Professor at Carnegie Mellon University (CMU)what book did he write?',\n"," 'output': 'Machine Learning'}"]},"execution_count":7,"metadata":{},"output_type":"execute_result"}],"source":["question = \"Tom M. Mitchell is an American computer scientist \\\n","and the Founders University Professor at Carnegie Mellon University (CMU)\\\n","what book did he write?\"\n","agent(question) "]},{"cell_type":"markdown","id":"39e2f6fb-f229-4235-ad58-98a5f505800c","metadata":{},"source":["✅ **总结**\n","\n","1. 模型对于接下来需要做什么,给出思考(Thought) \n"," 思考:我应该使用维基百科去搜索。
\n","\n","2. 模型基于思考采取行动(Action)\n"," 行动: 使用维基百科,输入Tom M. Mitchell
\n","3. 模型得到观察(Observation)\n"," 观测: 页面: Tom M. Mitchell,页面: Tom Mitchell (澳大利亚足球运动员)
\n","\n","4. 基于观察,模型对于接下来需要做什么,给出思考(Thought)\n"," 思考: Tom M. Mitchell写的书是Machine Learning
\n","\n","5. 给出最终答案(Final Answer)\n"," 最终答案: Machine Learning
\n","5. 以字典的形式给出最终答案。\n","\n","\n","值得注意的是,模型每次运行推理的过程可能存在差异,但最终的结果一致。"]},{"cell_type":"markdown","id":"5901cab6-a7c9-4590-b35d-d41c29e39a39","metadata":{},"source":["### 2.2 使用PythonREPLTool工具"]},{"cell_type":"markdown","id":"12b8d837-524c-46b8-9189-504808cf1f93","metadata":{},"source":["#### 1️⃣ 创建pyhon代理"]},{"cell_type":"code","execution_count":8,"id":"6c60d933-168d-4834-b29c-b2b43ade80cc","metadata":{},"outputs":[],"source":["from langchain_experimental.agents.agent_toolkits.python.base import create_python_agent\n","from langchain_experimental.tools.python.tool import PythonREPLTool\n","\n","agent = create_python_agent(\n"," llm, #使用前面一节已经加载的大语言模型\n"," tool=PythonREPLTool(), #使用Python交互式环境工具(REPLTool)\n"," verbose=True #输出中间步骤\n",")"]},{"cell_type":"markdown","id":"d32ed40c-cbcd-4efd-b044-57e611f5fab5","metadata":{"tags":[]},"source":["#### 2️⃣ 使用代理对顾客名字进行排序"]},{"cell_type":"code","execution_count":9,"id":"3d416f28-8fdb-4761-a4f9-d46c6d37fb3d","metadata":{},"outputs":[],"source":["customer_list = [\"小明\",\"小黄\",\"小红\",\"小蓝\",\"小橘\",\"小绿\",]"]},{"cell_type":"code","execution_count":10,"id":"ba7f9a50-acf6-4fba-a1b9-503baa80266b","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["\n","\n","\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n"]},{"name":"stderr","output_type":"stream","text":["/Users/lta/anaconda3/envs/cookbook/lib/python3.10/site-packages/langchain_core/_api/deprecation.py:119: LangChainDeprecationWarning: The method `Chain.run` was deprecated in langchain 0.1.0 and will be removed in 0.3.0. Use invoke instead.\n"," warn_deprecated(\n","Python REPL can execute arbitrary code. 
Use with caution.\n"]},{"name":"stdout","output_type":"stream","text":["\u001b[32;1m\u001b[1;3m我们可以使用Python的拼音库来将中文名字转换为拼音。\n","Action: Python_REPL\n","Action Input: \n","```python\n","from xpinyin import Pinyin\n","p = Pinyin()\n","names = ['小明', '小黄', '小红', '小蓝', '小橘', '小绿']\n","pinyins = [p.get_pinyin(name, '') for name in names]\n","print(pinyins)\n","```\u001b[0m\n","Observation: \u001b[36;1m\u001b[1;3m['xiaoming', 'xiaohuang', 'xiaohong', 'xiaolan', 'xiaoju', 'xiaolv']\n","\u001b[0m\n","Thought:\u001b[32;1m\u001b[1;3m我们已经成功将这些客户名字转换为拼音。\n","Final Answer: ['xiaoming', 'xiaohuang', 'xiaohong', 'xiaolan', 'xiaoju', 'xiaolv']\u001b[0m\n","\n","\u001b[1m> Finished chain.\u001b[0m\n"]},{"data":{"text/plain":["\"['xiaoming', 'xiaohuang', 'xiaohong', 'xiaolan', 'xiaoju', 'xiaolv']\""]},"execution_count":10,"metadata":{},"output_type":"execute_result"}],"source":["agent.run(f\"\"\"在这些客户名字转换为拼音\\\n","并打印输出列表: {customer_list}\\\n","思考过程请使用中文。\"\"\") "]},{"cell_type":"markdown","id":"16f530a5-8c50-4648-951f-4e65d15ca93f","metadata":{"jp-MarkdownHeadingCollapsed":true,"tags":[]},"source":["#### 3️⃣ 使用调试模式"]},{"cell_type":"markdown","id":"951048dc-0414-410d-a0e9-6ef32bbdda89","metadata":{"tags":[]},"source":["在调试(debug)模式下再次运行,我们可以把上面的6步分别对应到下面的具体流程\n","1. 模型对于接下来需要做什么,给出思考(Thought)\n"," - [chain/start] [1:chain:AgentExecutor] Entering Chain run with input
\n"," - [chain/start] [1:chain:AgentExecutor > 2:chain:LLMChain] Entering Chain run with input
\n"," - [llm/start] [1:chain:AgentExecutor > 2:chain:LLMChain > 3:llm:ChatOpenAI] Entering LLM run with input
\n"," - [llm/end] [1:chain:AgentExecutor > 2:chain:LLMChain > 3:llm:ChatOpenAI] [12.25s] Exiting LLM run with output
\n"," - [chain/end] [1:chain:AgentExecutor > 2:chain:LLMChain] [12.25s] Exiting Chain run with output
\n","2. 模型基于思考采取行动(Action), 因为使用的工具不同,Action的输出也和之前有所不同,这里输出的为python代码\n"," - [tool/start] [1:chain:AgentExecutor > 4:tool:Python REPL] Entering Tool run with input
\n"," - [tool/end] [1:chain:AgentExecutor > 4:tool:Python REPL] [2.2239999999999998ms] Exiting Tool run with output
\n","3. 模型得到观察(Observation) \n"," - [chain/start] [1:chain:AgentExecutor > 5:chain:LLMChain] Entering Chain run with input
\n","4. 基于观察,模型对于接下来需要做什么,给出思考(Thought) \n"," - [llm/start] [1:chain:AgentExecutor > 5:chain:LLMChain > 6:llm:ChatOpenAI] Entering LLM run with input
\n"," - [llm/end] [1:chain:AgentExecutor > 5:chain:LLMChain > 6:llm:ChatOpenAI] [6.94s] Exiting LLM run with output
\n"," \n","5. 给出最终答案(Final Answer) \n"," - [chain/end] [1:chain:AgentExecutor > 5:chain:LLMChain] [6.94s] Exiting Chain run with output
\n","6. 返回最终答案。\n"," - [chain/end] [1:chain:AgentExecutor] [19.20s] Exiting Chain run with output
"]},{"cell_type":"code","execution_count":11,"id":"92b8961b-b990-4540-a7f9-e4701dfaa616","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[chain:AgentExecutor] Entering Chain run with input:\n","\u001b[0m{\n"," \"input\": \"在这些客户名字转换为拼音并打印输出列表: ['小明', '小黄', '小红', '小蓝', '小橘', '小绿']思考过程请使用中文。\"\n","}\n","\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[chain:AgentExecutor > chain:LLMChain] Entering Chain run with input:\n","\u001b[0m{\n"," \"input\": \"在这些客户名字转换为拼音并打印输出列表: ['小明', '小黄', '小红', '小蓝', '小橘', '小绿']思考过程请使用中文。\",\n"," \"agent_scratchpad\": \"\",\n"," \"stop\": [\n"," \"\\nObservation:\",\n"," \"\\n\\tObservation:\"\n"," ]\n","}\n","\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[chain:AgentExecutor > chain:LLMChain > llm:ChatOpenAI] Entering LLM run with input:\n","\u001b[0m{\n"," \"prompts\": [\n"," \"Human: You are an agent designed to write and execute python code to answer questions.\\nYou have access to a python REPL, which you can use to execute python code.\\nIf you get an error, debug your code and try again.\\nOnly use the output of your code to answer the question. \\nYou might know the answer without running any code, but you should still run the code to get the answer.\\nIf it does not seem like you can write code to answer the question, just return \\\"I don't know\\\" as the answer.\\n\\n\\nPython_REPL - A Python shell. Use this to execute python commands. Input should be a valid python command. If you want to see the output of a value, you should print it out with `print(...)`.\\n\\nUse the following format:\\n\\nQuestion: the input question you must answer\\nThought: you should always think about what to do\\nAction: the action to take, should be one of [Python_REPL]\\nAction Input: the input to the action\\nObservation: the result of the action\\n... 
(this Thought/Action/Action Input/Observation can repeat N times)\\nThought: I now know the final answer\\nFinal Answer: the final answer to the original input question\\n\\nBegin!\\n\\nQuestion: 在这些客户名字转换为拼音并打印输出列表: ['小明', '小黄', '小红', '小蓝', '小橘', '小绿']思考过程请使用中文。\\nThought:\"\n"," ]\n","}\n","\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[chain:AgentExecutor > chain:LLMChain > llm:ChatOpenAI] [2.74s] Exiting LLM run with output:\n","\u001b[0m{\n"," \"generations\": [\n"," [\n"," {\n"," \"text\": \"我们可以使用Python的拼音库来将中文名字转换为拼音。\\nAction: Python_REPL\\nAction Input: \\n```python\\nfrom xpinyin import Pinyin\\np = Pinyin()\\nnames = ['小明', '小黄', '小红', '小蓝', '小橘', '小绿']\\npinyins = [p.get_pinyin(name, '') for name in names]\\nprint(pinyins)\\n```\",\n"," \"generation_info\": {\n"," \"finish_reason\": \"stop\",\n"," \"logprobs\": null\n"," },\n"," \"type\": \"ChatGeneration\",\n"," \"message\": {\n"," \"lc\": 1,\n"," \"type\": \"constructor\",\n"," \"id\": [\n"," \"langchain\",\n"," \"schema\",\n"," \"messages\",\n"," \"AIMessage\"\n"," ],\n"," \"kwargs\": {\n"," \"content\": \"我们可以使用Python的拼音库来将中文名字转换为拼音。\\nAction: Python_REPL\\nAction Input: \\n```python\\nfrom xpinyin import Pinyin\\np = Pinyin()\\nnames = ['小明', '小黄', '小红', '小蓝', '小橘', '小绿']\\npinyins = [p.get_pinyin(name, '') for name in names]\\nprint(pinyins)\\n```\",\n"," \"response_metadata\": {\n"," \"token_usage\": {\n"," \"completion_tokens\": 104,\n"," \"prompt_tokens\": 322,\n"," \"total_tokens\": 426\n"," },\n"," \"model_name\": \"gpt-3.5-turbo\",\n"," \"system_fingerprint\": null,\n"," \"finish_reason\": \"stop\",\n"," \"logprobs\": null\n"," },\n"," \"type\": \"ai\",\n"," \"id\": \"run-1d856a5f-0d28-4156-bda0-c1bfb8a1e22f-0\",\n"," \"tool_calls\": [],\n"," \"invalid_tool_calls\": []\n"," }\n"," }\n"," }\n"," ]\n"," ],\n"," \"llm_output\": {\n"," \"token_usage\": {\n"," \"completion_tokens\": 104,\n"," \"prompt_tokens\": 322,\n"," \"total_tokens\": 426\n"," },\n"," \"model_name\": 
\"gpt-3.5-turbo\",\n"," \"system_fingerprint\": null\n"," },\n"," \"run\": null\n","}\n","\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[chain:AgentExecutor > chain:LLMChain] [2.75s] Exiting Chain run with output:\n","\u001b[0m{\n"," \"text\": \"我们可以使用Python的拼音库来将中文名字转换为拼音。\\nAction: Python_REPL\\nAction Input: \\n```python\\nfrom xpinyin import Pinyin\\np = Pinyin()\\nnames = ['小明', '小黄', '小红', '小蓝', '小橘', '小绿']\\npinyins = [p.get_pinyin(name, '') for name in names]\\nprint(pinyins)\\n```\"\n","}\n","\u001b[32;1m\u001b[1;3m[tool/start]\u001b[0m \u001b[1m[chain:AgentExecutor > tool:Python_REPL] Entering Tool run with input:\n","\u001b[0m\"```python\n","from xpinyin import Pinyin\n","p = Pinyin()\n","names = ['小明', '小黄', '小红', '小蓝', '小橘', '小绿']\n","pinyins = [p.get_pinyin(name, '') for name in names]\n","print(pinyins)\n","```\"\n","\u001b[36;1m\u001b[1;3m[tool/end]\u001b[0m \u001b[1m[chain:AgentExecutor > tool:Python_REPL] [16ms] Exiting Tool run with output:\n","\u001b[0m\"['xiaoming', 'xiaohuang', 'xiaohong', 'xiaolan', 'xiaoju', 'xiaolv']\"\n","\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[chain:AgentExecutor > chain:LLMChain] Entering Chain run with input:\n","\u001b[0m{\n"," \"input\": \"在这些客户名字转换为拼音并打印输出列表: ['小明', '小黄', '小红', '小蓝', '小橘', '小绿']思考过程请使用中文。\",\n"," \"agent_scratchpad\": \"我们可以使用Python的拼音库来将中文名字转换为拼音。\\nAction: Python_REPL\\nAction Input: \\n```python\\nfrom xpinyin import Pinyin\\np = Pinyin()\\nnames = ['小明', '小黄', '小红', '小蓝', '小橘', '小绿']\\npinyins = [p.get_pinyin(name, '') for name in names]\\nprint(pinyins)\\n```\\nObservation: ['xiaoming', 'xiaohuang', 'xiaohong', 'xiaolan', 'xiaoju', 'xiaolv']\\n\\nThought:\",\n"," \"stop\": [\n"," \"\\nObservation:\",\n"," \"\\n\\tObservation:\"\n"," ]\n","}\n","\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[chain:AgentExecutor > chain:LLMChain > llm:ChatOpenAI] Entering LLM run with input:\n","\u001b[0m{\n"," \"prompts\": [\n"," \"Human: You are an agent designed to write and 
execute python code to answer questions.\\nYou have access to a python REPL, which you can use to execute python code.\\nIf you get an error, debug your code and try again.\\nOnly use the output of your code to answer the question. \\nYou might know the answer without running any code, but you should still run the code to get the answer.\\nIf it does not seem like you can write code to answer the question, just return \\\"I don't know\\\" as the answer.\\n\\n\\nPython_REPL - A Python shell. Use this to execute python commands. Input should be a valid python command. If you want to see the output of a value, you should print it out with `print(...)`.\\n\\nUse the following format:\\n\\nQuestion: the input question you must answer\\nThought: you should always think about what to do\\nAction: the action to take, should be one of [Python_REPL]\\nAction Input: the input to the action\\nObservation: the result of the action\\n... (this Thought/Action/Action Input/Observation can repeat N times)\\nThought: I now know the final answer\\nFinal Answer: the final answer to the original input question\\n\\nBegin!\\n\\nQuestion: 在这些客户名字转换为拼音并打印输出列表: ['小明', '小黄', '小红', '小蓝', '小橘', '小绿']思考过程请使用中文。\\nThought:我们可以使用Python的拼音库来将中文名字转换为拼音。\\nAction: Python_REPL\\nAction Input: \\n```python\\nfrom xpinyin import Pinyin\\np = Pinyin()\\nnames = ['小明', '小黄', '小红', '小蓝', '小橘', '小绿']\\npinyins = [p.get_pinyin(name, '') for name in names]\\nprint(pinyins)\\n```\\nObservation: ['xiaoming', 'xiaohuang', 'xiaohong', 'xiaolan', 'xiaoju', 'xiaolv']\\n\\nThought:\"\n"," ]\n","}\n","\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[chain:AgentExecutor > chain:LLMChain > llm:ChatOpenAI] [1.26s] Exiting LLM run with output:\n","\u001b[0m{\n"," \"generations\": [\n"," [\n"," {\n"," \"text\": \"我们成功地将中文名字转换为拼音。\\nFinal Answer: ['xiaoming', 'xiaohuang', 'xiaohong', 'xiaolan', 'xiaoju', 'xiaolv']\",\n"," \"generation_info\": {\n"," \"finish_reason\": \"stop\",\n"," \"logprobs\": null\n"," },\n"," 
\"type\": \"ChatGeneration\",\n"," \"message\": {\n"," \"lc\": 1,\n"," \"type\": \"constructor\",\n"," \"id\": [\n"," \"langchain\",\n"," \"schema\",\n"," \"messages\",\n"," \"AIMessage\"\n"," ],\n"," \"kwargs\": {\n"," \"content\": \"我们成功地将中文名字转换为拼音。\\nFinal Answer: ['xiaoming', 'xiaohuang', 'xiaohong', 'xiaolan', 'xiaoju', 'xiaolv']\",\n"," \"response_metadata\": {\n"," \"token_usage\": {\n"," \"completion_tokens\": 45,\n"," \"prompt_tokens\": 458,\n"," \"total_tokens\": 503\n"," },\n"," \"model_name\": \"gpt-3.5-turbo\",\n"," \"system_fingerprint\": null,\n"," \"finish_reason\": \"stop\",\n"," \"logprobs\": null\n"," },\n"," \"type\": \"ai\",\n"," \"id\": \"run-90afa3d5-a827-4279-bee2-4447fd057653-0\",\n"," \"tool_calls\": [],\n"," \"invalid_tool_calls\": []\n"," }\n"," }\n"," }\n"," ]\n"," ],\n"," \"llm_output\": {\n"," \"token_usage\": {\n"," \"completion_tokens\": 45,\n"," \"prompt_tokens\": 458,\n"," \"total_tokens\": 503\n"," },\n"," \"model_name\": \"gpt-3.5-turbo\",\n"," \"system_fingerprint\": null\n"," },\n"," \"run\": null\n","}\n","\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[chain:AgentExecutor > chain:LLMChain] [1.26s] Exiting Chain run with output:\n","\u001b[0m{\n"," \"text\": \"我们成功地将中文名字转换为拼音。\\nFinal Answer: ['xiaoming', 'xiaohuang', 'xiaohong', 'xiaolan', 'xiaoju', 'xiaolv']\"\n","}\n","\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[chain:AgentExecutor] [4.03s] Exiting Chain run with output:\n","\u001b[0m{\n"," \"output\": \"['xiaoming', 'xiaohuang', 'xiaohong', 'xiaolan', 'xiaoju', 'xiaolv']\"\n","}\n"]}],"source":["import langchain\n","langchain.debug=True\n","agent.run(f\"\"\"在这些客户名字转换为拼音\\\n","并打印输出列表: {customer_list}\\\n","思考过程请使用中文。\"\"\") \n","langchain.debug=False"]},{"cell_type":"markdown","id":"6b2ba50d-53c4-4eab-9467-98f562a072cc","metadata":{"tags":[]},"source":["## 三、 定义自己的工具并在代理中使用"]},{"cell_type":"markdown","id":"fb92f6e4-21ab-494d-9c22-d0678050fd37","metadata":{},"source":["### 3.1 
创建和使用自定义时间工具"]},{"cell_type":"code","execution_count":12,"id":"4b7e80e4-48d5-48f0-abeb-3dc33678ed9e","metadata":{},"outputs":[],"source":["# 导入tool函数装饰器\n","from langchain.agents import tool\n","from datetime import date"]},{"cell_type":"markdown","id":"8990b3c6-fe05-45b6-9567-06baec267a99","metadata":{},"source":["#### 1️⃣ 使用tool函数装饰器构建自定义工具\n","tool函数装饰器可以应用用于任何函数,将函数转化为LongChain工具,使其成为代理可调用的工具。\n","\n","我们需要给函数加上非常详细的文档字符串, 使得代理知道在什么情况下、如何使用该函数/工具。\n","\n","比如下面的函数`time`,我们加上了详细的文档字符串\n","\n","```python\n","\"\"\"\n","返回今天的日期,用于任何与获取今天日期相关的问题。\n","输入应该始终是一个空字符串,该函数将始终返回今天的日期。\n","任何日期的计算应该在此函数之外进行。\n","\"\"\"\n","\n","```"]},{"cell_type":"code","execution_count":13,"id":"e22a823f-3749-4bda-bac3-1b892da77929","metadata":{},"outputs":[],"source":["@tool\n","def time(text: str) -> str:\n"," \"\"\"\n"," 返回今天的日期,用于任何需要知道今天日期的问题。\\\n"," 输入应该总是一个空字符串,\\\n"," 这个函数将总是返回今天的日期,任何日期计算应该在这个函数之外进行。\n"," \"\"\"\n"," return str(date.today())"]},{"cell_type":"markdown","id":"440cb348-0ec6-4999-a6cd-81cb48b8c546","metadata":{},"source":["#### 2️⃣ 初始化代理"]},{"cell_type":"code","execution_count":14,"id":"09c9f59e-c14c-41b9-b6bd-0e97e8be72ab","metadata":{},"outputs":[],"source":["agent= initialize_agent(\n"," tools=[time], #将刚刚创建的时间工具加入代理\n"," llm=llm, #初始化的模型\n"," agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION, #代理类型\n"," handle_parsing_errors=True, #处理解析错误\n"," verbose = True #输出中间步骤\n",")"]},{"cell_type":"markdown","id":"f1a10657-23f2-4f78-a247-73f272119eb4","metadata":{},"source":["#### 3️⃣ 使用代理询问今天的日期\n","**注**: 代理有时候可能会出错(该功能正在开发中)。如果出现错误,请尝试再次运行它。"]},{"cell_type":"code","execution_count":15,"id":"f868e30b-ec67-4d76-9dc9-81ecd3332115","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["\n","\n","\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n","\u001b[32;1m\u001b[1;3mQuestion: What is today's date?\n","\n","Thought: I can use the time tool to get today's date.\n","\n","Action:\n","```\n","{\n"," \"action\": \"time\",\n"," \"action_input\": 
\"\"\n","}\n","```\n","\n","\u001b[0m\n","Observation: \u001b[36;1m\u001b[1;3m2024-06-11\u001b[0m\n","Thought:\u001b[32;1m\u001b[1;3mFinal Answer: 2024-06-11\u001b[0m\n","\n","\u001b[1m> Finished chain.\u001b[0m\n"]},{"data":{"text/plain":["{'input': '今天的日期是?', 'output': '2024-06-11'}"]},"execution_count":15,"metadata":{},"output_type":"execute_result"}],"source":["agent(\"今天的日期是?\") "]},{"cell_type":"markdown","id":"923a199f-08e8-4e70-afb5-70cd3e123acf","metadata":{},"source":["✅ **总结**\n","\n","1. 模型对于接下来需要做什么,给出思考(Thought) \n"," 思考:我需要使用 time 工具来获取今天的日期
\n","\n","2. 模型基于思考采取行动(Action), 因为使用的工具不同,Action的输出也和之前有所不同,这里输出的为python代码\n"," 行动: 使用time工具,输入为空字符串
\n"," \n"," \n","3. 模型得到观察(Observation)\n"," 观测: 2023-07-04
\n"," \n"," \n"," \n","4. 基于观察,模型对于接下来需要做什么,给出思考(Thought)\n"," 思考: 我已成功使用 time 工具检索到了今天的日期
\n","\n","5. 给出最终答案(Final Answer)\n"," 最终答案: 今天的日期是2023-07-04.
\n","6. 返回最终答案。"]}],"metadata":{"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.14"},"toc":{"base_numbering":1,"nav_menu":{},"number_sections":false,"sideBar":true,"skip_h1_title":false,"title_cell":"Table of Contents","title_sidebar":"Contents","toc_cell":false,"toc_position":{},"toc_section_display":true,"toc_window_display":true}},"nbformat":4,"nbformat_minor":5}
+{"cells": [{"cell_type": "markdown", "id": "2caa79ba-45e3-437c-9cb6-e433f443f0bf", "metadata": {}, "source": ["# \u7b2c\u4e03\u7ae0 \u4ee3\u7406\n", "\n", " - [\u4e00\u3001\u8bbe\u7f6eOpenAI API Key](#\u4e00\u3001\u8bbe\u7f6eOpenAI-API-Key)\n", " - [\u4e8c\u3001LangChain\u5185\u7f6e\u5de5\u5177](#\u4e8c\u3001LangChain\u5185\u7f6e\u5de5\u5177)\n", " - [2.1 \u4f7f\u7528llm-math\u548cwikipedia\u5de5\u5177](#2.1-\u4f7f\u7528llm-math\u548cwikipedia\u5de5\u5177)\n", " - [2.2 \u4f7f\u7528PythonREPLTool\u5de5\u5177](#2.2-\u4f7f\u7528PythonREPLTool\u5de5\u5177)\n", " - [\u4e09\u3001 \u5b9a\u4e49\u81ea\u5df1\u7684\u5de5\u5177\u5e76\u5728\u4ee3\u7406\u4e2d\u4f7f\u7528](#\u4e09\u3001-\u5b9a\u4e49\u81ea\u5df1\u7684\u5de5\u5177\u5e76\u5728\u4ee3\u7406\u4e2d\u4f7f\u7528)\n", " - [3.1 \u521b\u5efa\u548c\u4f7f\u7528\u81ea\u5b9a\u4e49\u65f6\u95f4\u5de5\u5177](#3.1-\u521b\u5efa\u548c\u4f7f\u7528\u81ea\u5b9a\u4e49\u65f6\u95f4\u5de5\u5177)\n"]}, {"cell_type": "markdown", "id": "8a31726a-82ac-46e4-a51e-7747fc4c6aed", "metadata": {"tags": []}, "source": ["## \u4e00\u3001\u8bbe\u7f6eOpenAI API Key\n", "\n", "\u767b\u5f55 [OpenAI \u8d26\u6237](https://platform.openai.com/account/api-keys) \u83b7\u53d6API Key\uff0c\u7136\u540e\u5c06\u5176\u8bbe\u7f6e\u4e3a\u73af\u5883\u53d8\u91cf\u3002\n", "\n", "- \u5982\u679c\u4f60\u60f3\u8981\u8bbe\u7f6e\u4e3a\u5168\u5c40\u73af\u5883\u53d8\u91cf\uff0c\u53ef\u4ee5\u53c2\u8003[\u77e5\u4e4e\u6587\u7ae0](https://zhuanlan.zhihu.com/p/627665725)\u3002\n", "- \u5982\u679c\u4f60\u60f3\u8981\u8bbe\u7f6e\u4e3a\u672c\u5730/\u9879\u76ee\u73af\u5883\u53d8\u91cf\uff0c\u5728\u672c\u6587\u4ef6\u76ee\u5f55\u4e0b\u521b\u5efa`.env`\u6587\u4ef6, \u6253\u5f00\u6587\u4ef6\u8f93\u5165\u4ee5\u4e0b\u5185\u5bb9\u3002\n", "\n", " \n", " OPENAI_API_KEY=\"your_api_key\" \n", "
\n", " \n", " \u66ff\u6362\"your_api_key\"\u4e3a\u4f60\u81ea\u5df1\u7684 API Key"]}, {"cell_type": "code", "execution_count": 1, "id": "beebf047-9117-43e3-b7b2-1f06cd3d4275", "metadata": {}, "outputs": [], "source": ["# \u4e0b\u8f7d\u9700\u8981\u7684\u5305python-dotenv\u548copenai\n", "# \u5982\u679c\u4f60\u9700\u8981\u67e5\u770b\u5b89\u88c5\u8fc7\u7a0b\u65e5\u5fd7\uff0c\u53ef\u5220\u9664 -q \n", "!pip install -q python-dotenv\n", "!pip install -q openai"]}, {"cell_type": "code", "execution_count": 2, "id": "312f4c23-fe27-4cda-a645-8e1519c2febf", "metadata": {"tags": []}, "outputs": [], "source": ["import os\n", "import openai\n", "from dotenv import load_dotenv, find_dotenv\n", "\n", "# \u8bfb\u53d6\u672c\u5730/\u9879\u76ee\u7684\u73af\u5883\u53d8\u91cf\u3002\n", "\n", "# find_dotenv()\u5bfb\u627e\u5e76\u5b9a\u4f4d.env\u6587\u4ef6\u7684\u8def\u5f84\n", "# load_dotenv()\u8bfb\u53d6\u8be5.env\u6587\u4ef6\uff0c\u5e76\u5c06\u5176\u4e2d\u7684\u73af\u5883\u53d8\u91cf\u52a0\u8f7d\u5230\u5f53\u524d\u7684\u8fd0\u884c\u73af\u5883\u4e2d \n", "# \u5982\u679c\u4f60\u8bbe\u7f6e\u7684\u662f\u5168\u5c40\u7684\u73af\u5883\u53d8\u91cf\uff0c\u8fd9\u884c\u4ee3\u7801\u5219\u6ca1\u6709\u4efb\u4f55\u4f5c\u7528\u3002\n", "_ = load_dotenv(find_dotenv())\n", "\n", "# \u83b7\u53d6\u73af\u5883\u53d8\u91cf OPENAI_API_KEY\n", "openai.api_key = os.environ['OPENAI_API_KEY'] "]}, {"cell_type": "markdown", "id": "631c764b-68fa-483a-80a5-9a322cd1117c", "metadata": {}, "source": ["## \u4e8c\u3001LangChain\u5185\u7f6e\u5de5\u5177"]}, {"cell_type": "code", "execution_count": 2, "id": "8ab03352-1554-474c-afda-ec8974250e31", "metadata": {}, "outputs": [], "source": ["# \u5982\u679c\u9700\u8981\u67e5\u770b\u5b89\u88c5\u8fc7\u7a0b\u65e5\u5fd7\uff0c\u53ef\u5220\u9664 -q \n", "# -U \u5b89\u88c5\u5230\u6700\u65b0\u7248\u672c\u7684 wikipedia. 
\u5176\u529f\u80fd\u540c --upgrade \n", "!pip install -U -q wikipedia\n", "!pip install -q openai"]}, {"cell_type": "code", "execution_count": 3, "id": "bc80a8a7-a66e-4fbb-8d57-05004a3e77c7", "metadata": {}, "outputs": [], "source": ["from langchain.agents import load_tools, initialize_agent\n", "from langchain.agents import AgentType\n", "from langchain.python import PythonREPL\n", "from langchain.chat_models import ChatOpenAI"]}, {"cell_type": "markdown", "id": "d5a3655c-4d5e-4a86-8bf4-a11bd1525059", "metadata": {}, "source": ["### 2.1 \u4f7f\u7528llm-math\u548cwikipedia\u5de5\u5177"]}, {"cell_type": "markdown", "id": "e484d285-2110-4a06-a360-55313d6d9ffc", "metadata": {}, "source": ["#### 1\ufe0f\u20e3 \u521d\u59cb\u5316\u5927\u8bed\u8a00\u6a21\u578b\n", "\n", "- \u9ed8\u8ba4\u5bc6\u94a5`openai_api_key`\u4e3a\u73af\u5883\u53d8\u91cf`OPENAI_API_KEY`\u3002\u56e0\u6b64\u5728\u8fd0\u884c\u4ee5\u4e0b\u4ee3\u7801\u4e4b\u524d\uff0c\u786e\u4fdd\u4f60\u5df2\u7ecf\u8bbe\u7f6e\u73af\u5883\u53d8\u91cf`OPENAI_API_KEY`\u3002\u5982\u679c\u8fd8\u6ca1\u6709\u5bc6\u94a5\uff0c\u8bf7[\u83b7\u53d6\u4f60\u7684API Key](https://platform.openai.com/account/api-keys) \u3002\n", "- \u9ed8\u8ba4\u6a21\u578b`model_name`\u4e3a`gpt-3.5-turbo`\u3002\n", "- \u66f4\u591a\u5173\u4e8e\u6a21\u578b\u9ed8\u8ba4\u53c2\u6570\u8bf7\u67e5\u770b[\u8fd9\u91cc](https://github.com/hwchase17/langchain/blob/master/langchain/chat_models/openai.py)\u3002"]}, {"cell_type": "code", "execution_count": 4, "id": "71e5894f-fc50-478b-a3d3-58891cc46ffc", "metadata": {}, "outputs": [], "source": ["# \u53c2\u6570temperature\u8bbe\u7f6e\u4e3a0.0\uff0c\u4ece\u800c\u51cf\u5c11\u751f\u6210\u7b54\u6848\u7684\u968f\u673a\u6027\u3002\n", "llm = ChatOpenAI(temperature=0)"]}, {"cell_type": "markdown", "id": "d7f75023-1825-4d74-bc8e-2c362f551fd1", "metadata": {"tags": []}, "source": ["#### 2\ufe0f\u20e3 \u52a0\u8f7d\u5de5\u5177\u5305\n", "- `llm-math` 
\u5de5\u5177\u7ed3\u5408\u8bed\u8a00\u6a21\u578b\u548c\u8ba1\u7b97\u5668\u7528\u4ee5\u8fdb\u884c\u6570\u5b66\u8ba1\u7b97\n", "- `wikipedia`\u5de5\u5177\u901a\u8fc7API\u8fde\u63a5\u5230wikipedia\u8fdb\u884c\u641c\u7d22\u67e5\u8be2\u3002"]}, {"cell_type": "code", "execution_count": 5, "id": "3541b866-8a35-4b6f-bfe6-66edb1d666cd", "metadata": {}, "outputs": [], "source": ["tools = load_tools(\n", " [\"llm-math\",\"wikipedia\"], \n", " llm=llm #\u7b2c\u4e00\u6b65\u521d\u59cb\u5316\u7684\u6a21\u578b\n", ")"]}, {"cell_type": "markdown", "id": "e5b4fcc8-8817-4a94-b154-3f328480e441", "metadata": {}, "source": ["#### 3\ufe0f\u20e3 \u521d\u59cb\u5316\u4ee3\u7406\n", "\n", "- `agent`: \u4ee3\u7406\u7c7b\u578b\u3002\u8fd9\u91cc\u4f7f\u7528\u7684\u662f`AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION`\u3002\u5176\u4e2d`CHAT`\u4ee3\u8868\u4ee3\u7406\u6a21\u578b\u4e3a\u9488\u5bf9\u5bf9\u8bdd\u4f18\u5316\u7684\u6a21\u578b\uff0c`REACT`\u4ee3\u8868\u9488\u5bf9REACT\u8bbe\u8ba1\u7684\u63d0\u793a\u6a21\u7248\u3002\n", "- `handle_parsing_errors`: \u662f\u5426\u5904\u7406\u89e3\u6790\u9519\u8bef\u3002\u5f53\u53d1\u751f\u89e3\u6790\u9519\u8bef\u65f6\uff0c\u5c06\u9519\u8bef\u4fe1\u606f\u8fd4\u56de\u7ed9\u5927\u6a21\u578b\uff0c\u8ba9\u5176\u8fdb\u884c\u7ea0\u6b63\u3002\n", "- `verbose`: \u662f\u5426\u8f93\u51fa\u4e2d\u95f4\u6b65\u9aa4\u7ed3\u679c\u3002"]}, {"cell_type": "code", "execution_count": 6, "id": "e7cdc782-2b8c-4649-ae1e-cabacee1a757", "metadata": {}, "outputs": [], "source": ["agent= initialize_agent(\n", " tools, #\u7b2c\u4e8c\u6b65\u52a0\u8f7d\u7684\u5de5\u5177\n", " llm, #\u7b2c\u4e00\u6b65\u521d\u59cb\u5316\u7684\u6a21\u578b\n", " agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION, #\u4ee3\u7406\u7c7b\u578b\n", " handle_parsing_errors=True, #\u5904\u7406\u89e3\u6790\u9519\u8bef\n", " verbose = True #\u8f93\u51fa\u4e2d\u95f4\u6b65\u9aa4\n", ")"]}, {"cell_type": "markdown", "id": "b980137a-a1d2-4c19-80c8-ec380f9c1efe", "metadata": {"tags": []}, "source": ["#### 
4\ufe0f\u20e3.1\ufe0f\u20e3 \u4f7f\u7528\u4ee3\u7406\u56de\u7b54\u6570\u5b66\u95ee\u9898"]}, {"cell_type": "code", "execution_count": 7, "id": "1e2bcd3b-ae72-4fa1-b38e-d0ce650e1f31", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["\n", "\n", "\u001b[1m> Entering new chain...\u001b[0m\n", "\u001b[32;1m\u001b[1;3mQuestion: \u8ba1\u7b97300\u768425%\n", "Thought: \u6211\u53ef\u4ee5\u4f7f\u7528\u8ba1\u7b97\u5668\u6765\u8ba1\u7b97\u8fd9\u4e2a\u767e\u5206\u6bd4\n", "Action:\n", "```\n", "{\n", " \"action\": \"Calculator\",\n", " \"action_input\": \"300 * 25 / 100\"\n", "}\n", "```\n", "\u001b[0m\n", "Observation: \u001b[36;1m\u001b[1;3mAnswer: 75.0\u001b[0m\n", "Thought:\u001b[32;1m\u001b[1;3m\u6211\u73b0\u5728\u77e5\u9053\u6700\u7ec8\u7b54\u6848\u4e86\n", "Final Answer: 300\u768425%\u662f75.0\u001b[0m\n", "\n", "\u001b[1m> Finished chain.\u001b[0m\n"]}, {"data": {"text/plain": ["{'input': '\u8ba1\u7b97300\u768425%\uff0c\u8bf7\u7528\u4e2d\u6587', 'output': '300\u768425%\u662f75.0'}"]}, "execution_count": 7, "metadata": {}, "output_type": "execute_result"}], "source": ["agent(\"\u8ba1\u7b97300\u768425%\uff0c\u601d\u8003\u8fc7\u7a0b\u8bf7\u4f7f\u7528\u4e2d\u6587\u3002\") "]}, {"cell_type": "markdown", "id": "05bb0811-d71e-4016-868b-efbb651d8e59", "metadata": {}, "source": ["\u2705 **\u603b\u7ed3**\n", "\n", "1. \u6a21\u578b\u5bf9\u4e8e\u63a5\u4e0b\u6765\u9700\u8981\u505a\u4ec0\u4e48\uff0c\u7ed9\u51fa\u601d\u8003\uff08Thought\uff09 \n", " \n", " \u601d\u8003\uff1a\u6211\u4eec\u9700\u8981\u8ba1\u7b97300\u768425%\uff0c\u8fd9\u4e2a\u8fc7\u7a0b\u4e2d\u9700\u8981\u7528\u5230\u4e58\u6cd5\u548c\u9664\u6cd5\u3002
\n", "\n", "2. \u6a21\u578b\u57fa\u4e8e\u601d\u8003\u91c7\u53d6\u884c\u52a8\uff08Action\uff09\n", " \u884c\u52a8: \u4f7f\u7528\u8ba1\u7b97\u5668\uff08Calculator\uff09\uff0c\u8f93\u5165300 * 25 / 100
\n", "3. \u6a21\u578b\u5f97\u5230\u89c2\u5bdf\uff08Observation\uff09\n", " \u89c2\u5bdf\uff1a\u7b54\u6848: 75.0
\n", "\n", "4. \u57fa\u4e8e\u89c2\u5bdf\uff0c\u6a21\u578b\u5bf9\u4e8e\u63a5\u4e0b\u6765\u9700\u8981\u505a\u4ec0\u4e48\uff0c\u7ed9\u51fa\u601d\u8003\uff08Thought\uff09\n", " \u601d\u8003: \u6211\u4eec\u7684\u95ee\u9898\u6709\u4e86\u7b54\u6848
\n", "\n", "5. \u7ed9\u51fa\u6700\u7ec8\u7b54\u6848\uff08Final Answer\uff09\n", " \u6700\u7ec8\u7b54\u6848: 75.0
\n", "5. \u4ee5\u5b57\u5178\u7684\u5f62\u5f0f\u7ed9\u51fa\u6700\u7ec8\u7b54\u6848\u3002"]}, {"cell_type": "markdown", "id": "bdd13d06-0f0e-4918-ac28-339f524f8c76", "metadata": {}, "source": ["#### 4\ufe0f\u20e3.2\ufe0f\u20e3 Tom M. Mitchell\u7684\u4e66"]}, {"cell_type": "code", "execution_count": 18, "id": "f43b5cbf-1011-491b-90e8-09cf903fb866", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["\n", "\n", "\u001b[1m> Entering new chain...\u001b[0m\n", "\u001b[32;1m\u001b[1;3mI need to find out what book Tom M. Mitchell wrote.\n", "Action: Search for Tom M. Mitchell's books.\n", "Action Input: \"Tom M. Mitchell books\"\u001b[0m\n", "Observation: Search for Tom M. Mitchell's books. is not a valid tool, try another one.\n", "Thought:\u001b[32;1m\u001b[1;3mI can try searching for Tom M. Mitchell's books using a search engine.\n", "Action: Search for \"Tom M. Mitchell books\"\n", "Action Input: \"Tom M. Mitchell books\"\u001b[0m\n", "Observation: Search for \"Tom M. Mitchell books\" is not a valid tool, try another one.\n", "Thought:\u001b[32;1m\u001b[1;3mI can try searching for Tom M. Mitchell's books using a search engine.\n", "Action: Search for \"Tom M. Mitchell books\"\n", "Action Input: \"Tom M. Mitchell books\"\u001b[0m\n", "Observation: Search for \"Tom M. Mitchell books\" is not a valid tool, try another one.\n", "Thought:"]}, {"name": "stderr", "output_type": "stream", "text": ["Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n"]}, {"name": "stdout", "output_type": "stream", "text": ["\u001b[32;1m\u001b[1;3mI can try searching for Tom M. Mitchell's books using a search engine.\n", "Action: Search for \"Tom M. Mitchell books\"\n", "Action Input: \"Tom M. Mitchell books\"\u001b[0m\n", "Observation: Search for \"Tom M. 
Mitchell books\" is not a valid tool, try another one.\n", "Thought:"]}, {"name": "stderr", "output_type": "stream", "text": ["Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. 
Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n"]}, {"name": "stdout", "output_type": "stream", "text": ["\u001b[32;1m\u001b[1;3mI can try searching for Tom M. Mitchell's books using a search engine.\n", "Action: Search for \"Tom M. Mitchell books\"\n", "Action Input: \"Tom M. Mitchell books\"\u001b[0m\n", "Observation: Search for \"Tom M. Mitchell books\" is not a valid tool, try another one.\n", "Thought:"]}, {"name": "stderr", "output_type": "stream", "text": ["Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. 
Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n"]}, {"name": "stdout", "output_type": "stream", "text": ["\u001b[32;1m\u001b[1;3mI can try searching for Tom M. Mitchell's books using a search engine.\n", "Action: Search for \"Tom M. Mitchell books\"\n", "Action Input: \"Tom M. Mitchell books\"\u001b[0m\n", "Observation: Search for \"Tom M. Mitchell books\" is not a valid tool, try another one.\n", "Thought:"]}, {"name": "stderr", "output_type": "stream", "text": ["Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n"]}, {"name": "stdout", "output_type": "stream", "text": ["\u001b[32;1m\u001b[1;3mI can try searching for Tom M. Mitchell's books using a search engine.\n", "Action: Search for \"Tom M. Mitchell books\"\n", "Action Input: \"Tom M. Mitchell books\"\u001b[0m\n", "Observation: Search for \"Tom M. 
Mitchell books\" is not a valid tool, try another one.\n", "Thought:"]}, {"name": "stderr", "output_type": "stream", "text": ["Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. 
Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n"]}, {"name": "stdout", "output_type": "stream", "text": ["\u001b[32;1m\u001b[1;3mI can try searching for Tom M. Mitchell's books using a search engine.\n", "Action: Search for \"Tom M. Mitchell books\"\n", "Action Input: \"Tom M. Mitchell books\"\u001b[0m\n", "Observation: Search for \"Tom M. Mitchell books\" is not a valid tool, try another one.\n", "Thought:"]}, {"name": "stderr", "output_type": "stream", "text": ["Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. 
Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n"]}, {"name": "stdout", "output_type": "stream", "text": ["\u001b[32;1m\u001b[1;3mI can try searching for Tom M. Mitchell's books using a search engine.\n", "Action: Search for \"Tom M. Mitchell books\"\n", "Action Input: \"Tom M. Mitchell books\"\u001b[0m\n", "Observation: Search for \"Tom M. Mitchell books\" is not a valid tool, try another one.\n", "Thought:"]}, {"name": "stderr", "output_type": "stream", "text": ["Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n"]}, {"name": "stdout", "output_type": "stream", "text": ["\u001b[32;1m\u001b[1;3mI can try searching for Tom M. Mitchell's books using a search engine.\n", "Action: Search for \"Tom M. Mitchell books\"\n", "Action Input: \"Tom M. Mitchell books\"\u001b[0m\n", "Observation: Search for \"Tom M. 
Mitchell books\" is not a valid tool, try another one.\n", "Thought:"]}, {"name": "stderr", "output_type": "stream", "text": ["Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. 
Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n"]}, {"name": "stdout", "output_type": "stream", "text": ["\u001b[32;1m\u001b[1;3mI can try searching for Tom M. Mitchell's books using a search engine.\n", "Action: Search for \"Tom M. Mitchell books\"\n", "Action Input: \"Tom M. Mitchell books\"\u001b[0m\n", "Observation: Search for \"Tom M. Mitchell books\" is not a valid tool, try another one.\n", "Thought:"]}, {"name": "stderr", "output_type": "stream", "text": ["Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. 
Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n"]}, {"name": "stdout", "output_type": "stream", "text": ["\u001b[32;1m\u001b[1;3mI can try searching for Tom M. Mitchell's books using a search engine.\n", "Action: Search for \"Tom M. Mitchell books\"\n", "Action Input: \"Tom M. Mitchell books\"\u001b[0m\n", "Observation: Search for \"Tom M. Mitchell books\" is not a valid tool, try another one.\n", "Thought:"]}, {"name": "stderr", "output_type": "stream", "text": ["Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n"]}, {"name": "stdout", "output_type": "stream", "text": ["\u001b[32;1m\u001b[1;3mI can try searching for Tom M. Mitchell's books using a search engine.\n", "Action: Search for \"Tom M. Mitchell books\"\n", "Action Input: \"Tom M. Mitchell books\"\u001b[0m\n", "Observation: Search for \"Tom M. 
Mitchell books\" is not a valid tool, try another one.\n", "Thought:"]}, {"name": "stderr", "output_type": "stream", "text": ["Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. 
Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n"]}, {"name": "stdout", "output_type": "stream", "text": ["\u001b[32;1m\u001b[1;3mI can try searching for Tom M. Mitchell's books using a search engine.\n", "Action: Search for \"Tom M. Mitchell books\"\n", "Action Input: \"Tom M. Mitchell books\"\u001b[0m\n", "Observation: Search for \"Tom M. Mitchell books\" is not a valid tool, try another one.\n", "Thought:"]}, {"name": "stderr", "output_type": "stream", "text": ["Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. 
Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n"]}, {"name": "stdout", "output_type": "stream", "text": ["\u001b[32;1m\u001b[1;3mI can try searching for Tom M. Mitchell's books using a search engine.\n", "Action: Search for \"Tom M. Mitchell books\"\n", "Action Input: \"Tom M. Mitchell books\"\u001b[0m\n", "Observation: Search for \"Tom M. Mitchell books\" is not a valid tool, try another one.\n", "Thought:\u001b[32;1m\u001b[1;3m\u001b[0m\n", "\n", "\u001b[1m> Finished chain.\u001b[0m\n"]}, {"data": {"text/plain": ["{'input': 'Tom M. Mitchell is an American computer scientist and the Founders University Professor at Carnegie Mellon University (CMU)what book did he write?',\n", " 'output': 'Agent stopped due to iteration limit or time limit.'}"]}, "execution_count": 18, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"Tom M. Mitchell is an American computer scientist \\\n", "and the Founders University Professor at Carnegie Mellon University (CMU)\\\n", "what book did he write?\"\n", "agent(question) "]}, {"cell_type": "markdown", "id": "39e2f6fb-f229-4235-ad58-98a5f505800c", "metadata": {}, "source": ["\u2705 **\u603b\u7ed3**\n", "\n", "1. 
\u6a21\u578b\u5bf9\u4e8e\u63a5\u4e0b\u6765\u9700\u8981\u505a\u4ec0\u4e48\uff0c\u7ed9\u51fa\u601d\u8003\uff08Thought\uff09 \n", " \u601d\u8003\uff1a\u6211\u5e94\u8be5\u4f7f\u7528\u7ef4\u57fa\u767e\u79d1\u53bb\u641c\u7d22\u3002
\n", "\n", "2. \u6a21\u578b\u57fa\u4e8e\u601d\u8003\u91c7\u53d6\u884c\u52a8\uff08Action\uff09\n", " \u884c\u52a8: \u4f7f\u7528\u7ef4\u57fa\u767e\u79d1\uff0c\u8f93\u5165Tom M. Mitchell
\n", "3. \u6a21\u578b\u5f97\u5230\u89c2\u5bdf\uff08Observation\uff09\n", " \u89c2\u5bdf: \u9875\u9762: Tom M. Mitchell\uff0c\u9875\u9762: Tom Mitchell (\u6fb3\u5927\u5229\u4e9a\u8db3\u7403\u8fd0\u52a8\u5458)
\n", "\n", "4. \u57fa\u4e8e\u89c2\u5bdf\uff0c\u6a21\u578b\u5bf9\u4e8e\u63a5\u4e0b\u6765\u9700\u8981\u505a\u4ec0\u4e48\uff0c\u7ed9\u51fa\u601d\u8003\uff08Thought\uff09\n", " \u601d\u8003: Tom M. Mitchell\u5199\u7684\u4e66\u662fMachine Learning
\n", "\n", "5. \u7ed9\u51fa\u6700\u7ec8\u7b54\u6848\uff08Final Answer\uff09\n", " \u6700\u7ec8\u7b54\u6848: Machine Learning
\n", "6. \u4ee5\u5b57\u5178\u7684\u5f62\u5f0f\u7ed9\u51fa\u6700\u7ec8\u7b54\u6848\u3002\n", "\n", "\n", "\u503c\u5f97\u6ce8\u610f\u7684\u662f\uff0c\u6a21\u578b\u6bcf\u6b21\u8fd0\u884c\u63a8\u7406\u7684\u8fc7\u7a0b\u53ef\u80fd\u5b58\u5728\u5dee\u5f02\uff0c\u4f46\u6700\u7ec8\u7684\u7ed3\u679c\u4e00\u81f4\u3002"]}, {"cell_type": "markdown", "id": "5901cab6-a7c9-4590-b35d-d41c29e39a39", "metadata": {}, "source": ["### 2.2 \u4f7f\u7528PythonREPLTool\u5de5\u5177"]}, {"cell_type": "markdown", "id": "12b8d837-524c-46b8-9189-504808cf1f93", "metadata": {}, "source": ["#### 1\ufe0f\u20e3 \u521b\u5efaPython\u4ee3\u7406"]}, {"cell_type": "code", "execution_count": 8, "id": "6c60d933-168d-4834-b29c-b2b43ade80cc", "metadata": {}, "outputs": [], "source": ["from langchain.agents.agent_toolkits import create_python_agent\n", "from langchain.tools.python.tool import PythonREPLTool\n", "\n", "agent = create_python_agent(\n", " llm, #\u4f7f\u7528\u524d\u9762\u4e00\u8282\u5df2\u7ecf\u52a0\u8f7d\u7684\u5927\u8bed\u8a00\u6a21\u578b\n", " tool=PythonREPLTool(), #\u4f7f\u7528Python\u4ea4\u4e92\u5f0f\u73af\u5883\u5de5\u5177\uff08REPLTool\uff09\n", " verbose=True #\u8f93\u51fa\u4e2d\u95f4\u6b65\u9aa4\n", ")"]}, {"cell_type": "markdown", "id": "d32ed40c-cbcd-4efd-b044-57e611f5fab5", "metadata": {"tags": []}, "source": ["#### 2\ufe0f\u20e3 \u4f7f\u7528\u4ee3\u7406\u5c06\u987e\u5ba2\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3"]}, {"cell_type": "code", "execution_count": 14, "id": "3d416f28-8fdb-4761-a4f9-d46c6d37fb3d", "metadata": {}, "outputs": [], "source": ["customer_list = [\"\u5c0f\u660e\",\"\u5c0f\u9ec4\",\"\u5c0f\u7ea2\",\"\u5c0f\u84dd\",\"\u5c0f\u6a58\",\"\u5c0f\u7eff\",]"]}, {"cell_type": "code", "execution_count": 22, "id": "d0fbfc4c", "metadata": {}, "outputs": [], "source": ["!pip install -q pinyin"]}, {"cell_type": "code", "execution_count": 23, "id": "ba7f9a50-acf6-4fba-a1b9-503baa80266b", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["\n", "\n", 
"\u001b[1m> Entering new chain...\u001b[0m\n", "\u001b[32;1m\u001b[1;3m\u6211\u9700\u8981\u4f7f\u7528\u62fc\u97f3\u5e93\u6765\u5c06\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\u3002\u6211\u53ef\u4ee5\u4f7f\u7528Python\u7684pinyin\u5e93\u6765\u5b9e\u73b0\u8fd9\u4e2a\u529f\u80fd\u3002\n", "Action: Python_REPL\n", "Action Input: import pinyin\u001b[0m\n", "Observation: \u001b[36;1m\u001b[1;3m\u001b[0m\n", "Thought:\u001b[32;1m\u001b[1;3m\u6211\u73b0\u5728\u53ef\u4ee5\u4f7f\u7528pinyin\u5e93\u6765\u5c06\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\u4e86\u3002\n", "Action: Python_REPL\n", "Action Input: names = ['\u5c0f\u660e', '\u5c0f\u9ec4', '\u5c0f\u7ea2', '\u5c0f\u84dd', '\u5c0f\u6a58', '\u5c0f\u7eff']\u001b[0m\n", "Observation: \u001b[36;1m\u001b[1;3m\u001b[0m\n", "Thought:\u001b[32;1m\u001b[1;3m\u6211\u73b0\u5728\u53ef\u4ee5\u4f7f\u7528pinyin\u5e93\u5c06names\u5217\u8868\u4e2d\u7684\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\u4e86\u3002\n", "Action: Python_REPL\n", "Action Input: pinyin_names = [pinyin.get(i, format='strip', delimiter='') for i in names]\u001b[0m\n", "Observation: \u001b[36;1m\u001b[1;3m\u001b[0m\n", "Thought:"]}, {"name": "stderr", "output_type": "stream", "text": ["Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method..\n"]}, {"name": "stdout", "output_type": "stream", "text": ["\u001b[32;1m\u001b[1;3m\u6211\u5df2\u7ecf\u6210\u529f\u5c06\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\u5e76\u5b58\u50a8\u5728pinyin_names\u5217\u8868\u4e2d\u4e86\u3002\n", "Final Answer: pinyin_names = ['xiaoming', 'xiaohuang', 'xiaohong', 'xiaolan', 'xiaojv', 'xiaolv']\u001b[0m\n", "\n", "\u001b[1m> Finished chain.\u001b[0m\n"]}, {"data": {"text/plain": ["\"pinyin_names = ['xiaoming', 'xiaohuang', 'xiaohong', 'xiaolan', 'xiaojv', 'xiaolv']\""]}, "execution_count": 23, "metadata": {}, "output_type": "execute_result"}], "source": ["agent.run(f\"\"\"\u5728\u8fd9\u4e9b\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\\\n", "\u5e76\u6253\u5370\u8f93\u51fa\u5217\u8868\uff1a {customer_list}\\\n", "\u601d\u8003\u8fc7\u7a0b\u8bf7\u4f7f\u7528\u4e2d\u6587\u3002\"\"\") "]}, {"cell_type": "markdown", "id": "16f530a5-8c50-4648-951f-4e65d15ca93f", "metadata": {"jp-MarkdownHeadingCollapsed": true, "tags": []}, "source": ["#### 3\ufe0f\u20e3 \u4f7f\u7528\u8c03\u8bd5\u6a21\u5f0f"]}, {"cell_type": "markdown", "id": "951048dc-0414-410d-a0e9-6ef32bbdda89", "metadata": {"tags": []}, "source": ["\u5728\u8c03\u8bd5\uff08debug\uff09\u6a21\u5f0f\u4e0b\u518d\u6b21\u8fd0\u884c\uff0c\u6211\u4eec\u53ef\u4ee5\u628a\u4e0a\u9762\u76846\u6b65\u5206\u522b\u5bf9\u5e94\u5230\u4e0b\u9762\u7684\u5177\u4f53\u6d41\u7a0b\n", "1. \u6a21\u578b\u5bf9\u4e8e\u63a5\u4e0b\u6765\u9700\u8981\u505a\u4ec0\u4e48\uff0c\u7ed9\u51fa\u601d\u8003\uff08Thought\uff09\n", " - [chain/start] [1:chain:AgentExecutor] Entering Chain run with input
\n", " - [chain/start] [1:chain:AgentExecutor > 2:chain:LLMChain] Entering Chain run with input
\n", " - [llm/start] [1:chain:AgentExecutor > 2:chain:LLMChain > 3:llm:ChatOpenAI] Entering LLM run with input
\n", " - [llm/end] [1:chain:AgentExecutor > 2:chain:LLMChain > 3:llm:ChatOpenAI] [12.25s] Exiting LLM run with output
\n", " - [chain/end] [1:chain:AgentExecutor > 2:chain:LLMChain] [12.25s] Exiting Chain run with output
\n", "2. \u6a21\u578b\u57fa\u4e8e\u601d\u8003\u91c7\u53d6\u884c\u52a8\uff08Action\uff09\uff0c\u56e0\u4e3a\u4f7f\u7528\u7684\u5de5\u5177\u4e0d\u540c\uff0cAction\u7684\u8f93\u51fa\u4e5f\u548c\u4e4b\u524d\u6709\u6240\u4e0d\u540c\uff0c\u8fd9\u91cc\u8f93\u51fa\u7684\u4e3apython\u4ee3\u7801\n", " - [tool/start] [1:chain:AgentExecutor > 4:tool:Python REPL] Entering Tool run with input
\n", " - [tool/end] [1:chain:AgentExecutor > 4:tool:Python REPL] [2.2239999999999998ms] Exiting Tool run with output
\n", "3. \u6a21\u578b\u5f97\u5230\u89c2\u5bdf\uff08Observation\uff09 \n", " - [chain/start] [1:chain:AgentExecutor > 5:chain:LLMChain] Entering Chain run with input
\n", "4. \u57fa\u4e8e\u89c2\u5bdf\uff0c\u6a21\u578b\u5bf9\u4e8e\u63a5\u4e0b\u6765\u9700\u8981\u505a\u4ec0\u4e48\uff0c\u7ed9\u51fa\u601d\u8003\uff08Thought\uff09 \n", " - [llm/start] [1:chain:AgentExecutor > 5:chain:LLMChain > 6:llm:ChatOpenAI] Entering LLM run with input
\n", " - [llm/end] [1:chain:AgentExecutor > 5:chain:LLMChain > 6:llm:ChatOpenAI] [6.94s] Exiting LLM run with output
\n", " \n", "5. \u7ed9\u51fa\u6700\u7ec8\u7b54\u6848\uff08Final Answer\uff09 \n", " - [chain/end] [1:chain:AgentExecutor > 5:chain:LLMChain] [6.94s] Exiting Chain run with output
\n", "6. \u8fd4\u56de\u6700\u7ec8\u7b54\u6848\u3002\n", " - [chain/end] [1:chain:AgentExecutor] [19.20s] Exiting Chain run with output
"]}, {"cell_type": "code", "execution_count": 25, "id": "92b8961b-b990-4540-a7f9-e4701dfaa616", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor] Entering Chain run with input:\n", "\u001b[0m{\n", " \"input\": \"\u5728\u8fd9\u4e9b\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\u5e76\u6253\u5370\u8f93\u51fa\u5217\u8868\uff1a ['\u5c0f\u660e', '\u5c0f\u9ec4', '\u5c0f\u7ea2', '\u5c0f\u84dd', '\u5c0f\u6a58', '\u5c0f\u7eff']\u601d\u8003\u8fc7\u7a0b\u8bf7\u4f7f\u7528\u4e2d\u6587\u3002\"\n", "}\n", "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 2:chain:LLMChain] Entering Chain run with input:\n", "\u001b[0m{\n", " \"input\": \"\u5728\u8fd9\u4e9b\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\u5e76\u6253\u5370\u8f93\u51fa\u5217\u8868\uff1a ['\u5c0f\u660e', '\u5c0f\u9ec4', '\u5c0f\u7ea2', '\u5c0f\u84dd', '\u5c0f\u6a58', '\u5c0f\u7eff']\u601d\u8003\u8fc7\u7a0b\u8bf7\u4f7f\u7528\u4e2d\u6587\u3002\",\n", " \"agent_scratchpad\": \"\",\n", " \"stop\": [\n", " \"\\nObservation:\",\n", " \"\\n\\tObservation:\"\n", " ]\n", "}\n", "\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 2:chain:LLMChain > 3:llm:ChatOpenAI] Entering LLM run with input:\n", "\u001b[0m{\n", " \"prompts\": [\n", " \"Human: You are an agent designed to write and execute python code to answer questions.\\nYou have access to a python REPL, which you can use to execute python code.\\nIf you get an error, debug your code and try again.\\nOnly use the output of your code to answer the question. \\nYou might know the answer without running any code, but you should still run the code to get the answer.\\nIf it does not seem like you can write code to answer the question, just return \\\"I don't know\\\" as the answer.\\n\\n\\nPython_REPL: A Python shell. Use this to execute python commands. Input should be a valid python command. 
If you want to see the output of a value, you should print it out with `print(...)`.\\n\\nUse the following format:\\n\\nQuestion: the input question you must answer\\nThought: you should always think about what to do\\nAction: the action to take, should be one of [Python_REPL]\\nAction Input: the input to the action\\nObservation: the result of the action\\n... (this Thought/Action/Action Input/Observation can repeat N times)\\nThought: I now know the final answer\\nFinal Answer: the final answer to the original input question\\n\\nBegin!\\n\\nQuestion: \u5728\u8fd9\u4e9b\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\u5e76\u6253\u5370\u8f93\u51fa\u5217\u8868\uff1a ['\u5c0f\u660e', '\u5c0f\u9ec4', '\u5c0f\u7ea2', '\u5c0f\u84dd', '\u5c0f\u6a58', '\u5c0f\u7eff']\u601d\u8003\u8fc7\u7a0b\u8bf7\u4f7f\u7528\u4e2d\u6587\u3002\\nThought:\"\n", " ]\n", "}\n", "\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 2:chain:LLMChain > 3:llm:ChatOpenAI] [2.91s] Exiting LLM run with output:\n", "\u001b[0m{\n", " \"generations\": [\n", " [\n", " {\n", " \"text\": \"\u6211\u9700\u8981\u4f7f\u7528\u62fc\u97f3\u8f6c\u6362\u5e93\u6765\u5c06\u8fd9\u4e9b\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\u3002\u6211\u53ef\u4ee5\u4f7f\u7528Python\u7684pinyin\u5e93\u6765\u5b8c\u6210\u8fd9\u4e2a\u4efb\u52a1\u3002\\nAction: Python_REPL\\nAction Input: import pinyin\",\n", " \"generation_info\": null,\n", " \"message\": {\n", " \"content\": \"\u6211\u9700\u8981\u4f7f\u7528\u62fc\u97f3\u8f6c\u6362\u5e93\u6765\u5c06\u8fd9\u4e9b\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\u3002\u6211\u53ef\u4ee5\u4f7f\u7528Python\u7684pinyin\u5e93\u6765\u5b8c\u6210\u8fd9\u4e2a\u4efb\u52a1\u3002\\nAction: Python_REPL\\nAction Input: import pinyin\",\n", " \"additional_kwargs\": {},\n", " \"example\": false\n", " }\n", " }\n", " ]\n", " ],\n", " \"llm_output\": {\n", " \"token_usage\": {\n", " \"prompt_tokens\": 322,\n", " \"completion_tokens\": 50,\n", " 
\"total_tokens\": 372\n", " },\n", " \"model_name\": \"gpt-3.5-turbo\"\n", " },\n", " \"run\": null\n", "}\n", "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 2:chain:LLMChain] [2.91s] Exiting Chain run with output:\n", "\u001b[0m{\n", " \"text\": \"\u6211\u9700\u8981\u4f7f\u7528\u62fc\u97f3\u8f6c\u6362\u5e93\u6765\u5c06\u8fd9\u4e9b\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\u3002\u6211\u53ef\u4ee5\u4f7f\u7528Python\u7684pinyin\u5e93\u6765\u5b8c\u6210\u8fd9\u4e2a\u4efb\u52a1\u3002\\nAction: Python_REPL\\nAction Input: import pinyin\"\n", "}\n", "\u001b[32;1m\u001b[1;3m[tool/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 4:tool:Python_REPL] Entering Tool run with input:\n", "\u001b[0m\"import pinyin\"\n", "\u001b[36;1m\u001b[1;3m[tool/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 4:tool:Python_REPL] [0.0ms] Exiting Tool run with output:\n", "\u001b[0m\"\"\n", "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 5:chain:LLMChain] Entering Chain run with input:\n", "\u001b[0m{\n", " \"input\": \"\u5728\u8fd9\u4e9b\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\u5e76\u6253\u5370\u8f93\u51fa\u5217\u8868\uff1a ['\u5c0f\u660e', '\u5c0f\u9ec4', '\u5c0f\u7ea2', '\u5c0f\u84dd', '\u5c0f\u6a58', '\u5c0f\u7eff']\u601d\u8003\u8fc7\u7a0b\u8bf7\u4f7f\u7528\u4e2d\u6587\u3002\",\n", " \"agent_scratchpad\": \"\u6211\u9700\u8981\u4f7f\u7528\u62fc\u97f3\u8f6c\u6362\u5e93\u6765\u5c06\u8fd9\u4e9b\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\u3002\u6211\u53ef\u4ee5\u4f7f\u7528Python\u7684pinyin\u5e93\u6765\u5b8c\u6210\u8fd9\u4e2a\u4efb\u52a1\u3002\\nAction: Python_REPL\\nAction Input: import pinyin\\nObservation: \\nThought:\",\n", " \"stop\": [\n", " \"\\nObservation:\",\n", " \"\\n\\tObservation:\"\n", " ]\n", "}\n", "\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 5:chain:LLMChain > 6:llm:ChatOpenAI] Entering LLM run with input:\n", "\u001b[0m{\n", " \"prompts\": 
[\n", " \"Human: You are an agent designed to write and execute python code to answer questions.\\nYou have access to a python REPL, which you can use to execute python code.\\nIf you get an error, debug your code and try again.\\nOnly use the output of your code to answer the question. \\nYou might know the answer without running any code, but you should still run the code to get the answer.\\nIf it does not seem like you can write code to answer the question, just return \\\"I don't know\\\" as the answer.\\n\\n\\nPython_REPL: A Python shell. Use this to execute python commands. Input should be a valid python command. If you want to see the output of a value, you should print it out with `print(...)`.\\n\\nUse the following format:\\n\\nQuestion: the input question you must answer\\nThought: you should always think about what to do\\nAction: the action to take, should be one of [Python_REPL]\\nAction Input: the input to the action\\nObservation: the result of the action\\n... (this Thought/Action/Action Input/Observation can repeat N times)\\nThought: I now know the final answer\\nFinal Answer: the final answer to the original input question\\n\\nBegin!\\n\\nQuestion: \u5728\u8fd9\u4e9b\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\u5e76\u6253\u5370\u8f93\u51fa\u5217\u8868\uff1a ['\u5c0f\u660e', '\u5c0f\u9ec4', '\u5c0f\u7ea2', '\u5c0f\u84dd', '\u5c0f\u6a58', '\u5c0f\u7eff']\u601d\u8003\u8fc7\u7a0b\u8bf7\u4f7f\u7528\u4e2d\u6587\u3002\\nThought:\u6211\u9700\u8981\u4f7f\u7528\u62fc\u97f3\u8f6c\u6362\u5e93\u6765\u5c06\u8fd9\u4e9b\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\u3002\u6211\u53ef\u4ee5\u4f7f\u7528Python\u7684pinyin\u5e93\u6765\u5b8c\u6210\u8fd9\u4e2a\u4efb\u52a1\u3002\\nAction: Python_REPL\\nAction Input: import pinyin\\nObservation: \\nThought:\"\n", " ]\n", "}\n"]}, {"name": "stderr", "output_type": "stream", "text": ["Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it 
raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method..\n"]}, {"name": "stdout", "output_type": "stream", "text": ["\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 5:chain:LLMChain > 6:llm:ChatOpenAI] [12.76s] Exiting LLM run with output:\n", "\u001b[0m{\n", " \"generations\": [\n", " [\n", " {\n", " \"text\": \"\u6211\u73b0\u5728\u53ef\u4ee5\u4f7f\u7528pinyin\u5e93\u6765\u5c06\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\u3002\\nAction: Python_REPL\\nAction Input: names = ['\u5c0f\u660e', '\u5c0f\u9ec4', '\u5c0f\u7ea2', '\u5c0f\u84dd', '\u5c0f\u6a58', '\u5c0f\u7eff']\\npinyin_names = [pinyin.get(i, format='strip', delimiter='') for i in names]\\nprint(pinyin_names)\",\n", " \"generation_info\": null,\n", " \"message\": {\n", " \"content\": \"\u6211\u73b0\u5728\u53ef\u4ee5\u4f7f\u7528pinyin\u5e93\u6765\u5c06\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\u3002\\nAction: Python_REPL\\nAction Input: names = ['\u5c0f\u660e', '\u5c0f\u9ec4', '\u5c0f\u7ea2', '\u5c0f\u84dd', '\u5c0f\u6a58', '\u5c0f\u7eff']\\npinyin_names = [pinyin.get(i, format='strip', delimiter='') for i in names]\\nprint(pinyin_names)\",\n", " \"additional_kwargs\": {},\n", " \"example\": false\n", " }\n", " }\n", " ]\n", " ],\n", " \"llm_output\": {\n", " \"token_usage\": {\n", " \"prompt_tokens\": 379,\n", " \"completion_tokens\": 89,\n", " \"total_tokens\": 468\n", " },\n", " \"model_name\": \"gpt-3.5-turbo\"\n", " },\n", " \"run\": null\n", "}\n", "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 5:chain:LLMChain] [12.77s] Exiting Chain run with output:\n", "\u001b[0m{\n", " \"text\": \"\u6211\u73b0\u5728\u53ef\u4ee5\u4f7f\u7528pinyin\u5e93\u6765\u5c06\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\u3002\\nAction: Python_REPL\\nAction Input: names = ['\u5c0f\u660e', '\u5c0f\u9ec4', '\u5c0f\u7ea2', '\u5c0f\u84dd', '\u5c0f\u6a58', '\u5c0f\u7eff']\\npinyin_names = [pinyin.get(i, format='strip', 
delimiter='') for i in names]\\nprint(pinyin_names)\"\n", "}\n", "\u001b[32;1m\u001b[1;3m[tool/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 7:tool:Python_REPL] Entering Tool run with input:\n", "\u001b[0m\"names = ['\u5c0f\u660e', '\u5c0f\u9ec4', '\u5c0f\u7ea2', '\u5c0f\u84dd', '\u5c0f\u6a58', '\u5c0f\u7eff']\n", "pinyin_names = [pinyin.get(i, format='strip', delimiter='') for i in names]\n", "print(pinyin_names)\"\n", "\u001b[36;1m\u001b[1;3m[tool/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 7:tool:Python_REPL] [0.0ms] Exiting Tool run with output:\n", "\u001b[0m\"['xiaoming', 'xiaohuang', 'xiaohong', 'xiaolan', 'xiaoju', 'xiaolv']\"\n", "\u001b[32;1m\u001b[1;3m[chain/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 8:chain:LLMChain] Entering Chain run with input:\n", "\u001b[0m{\n", " \"input\": \"\u5728\u8fd9\u4e9b\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\u5e76\u6253\u5370\u8f93\u51fa\u5217\u8868\uff1a ['\u5c0f\u660e', '\u5c0f\u9ec4', '\u5c0f\u7ea2', '\u5c0f\u84dd', '\u5c0f\u6a58', '\u5c0f\u7eff']\u601d\u8003\u8fc7\u7a0b\u8bf7\u4f7f\u7528\u4e2d\u6587\u3002\",\n", " \"agent_scratchpad\": \"\u6211\u9700\u8981\u4f7f\u7528\u62fc\u97f3\u8f6c\u6362\u5e93\u6765\u5c06\u8fd9\u4e9b\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\u3002\u6211\u53ef\u4ee5\u4f7f\u7528Python\u7684pinyin\u5e93\u6765\u5b8c\u6210\u8fd9\u4e2a\u4efb\u52a1\u3002\\nAction: Python_REPL\\nAction Input: import pinyin\\nObservation: \\nThought:\u6211\u73b0\u5728\u53ef\u4ee5\u4f7f\u7528pinyin\u5e93\u6765\u5c06\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\u3002\\nAction: Python_REPL\\nAction Input: names = ['\u5c0f\u660e', '\u5c0f\u9ec4', '\u5c0f\u7ea2', '\u5c0f\u84dd', '\u5c0f\u6a58', '\u5c0f\u7eff']\\npinyin_names = [pinyin.get(i, format='strip', delimiter='') for i in names]\\nprint(pinyin_names)\\nObservation: ['xiaoming', 'xiaohuang', 'xiaohong', 'xiaolan', 'xiaoju', 'xiaolv']\\n\\nThought:\",\n", " \"stop\": [\n", " \"\\nObservation:\",\n", " 
\"\\n\\tObservation:\"\n", " ]\n", "}\n", "\u001b[32;1m\u001b[1;3m[llm/start]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 8:chain:LLMChain > 9:llm:ChatOpenAI] Entering LLM run with input:\n", "\u001b[0m{\n", " \"prompts\": [\n", " \"Human: You are an agent designed to write and execute python code to answer questions.\\nYou have access to a python REPL, which you can use to execute python code.\\nIf you get an error, debug your code and try again.\\nOnly use the output of your code to answer the question. \\nYou might know the answer without running any code, but you should still run the code to get the answer.\\nIf it does not seem like you can write code to answer the question, just return \\\"I don't know\\\" as the answer.\\n\\n\\nPython_REPL: A Python shell. Use this to execute python commands. Input should be a valid python command. If you want to see the output of a value, you should print it out with `print(...)`.\\n\\nUse the following format:\\n\\nQuestion: the input question you must answer\\nThought: you should always think about what to do\\nAction: the action to take, should be one of [Python_REPL]\\nAction Input: the input to the action\\nObservation: the result of the action\\n... 
(this Thought/Action/Action Input/Observation can repeat N times)\\nThought: I now know the final answer\\nFinal Answer: the final answer to the original input question\\n\\nBegin!\\n\\nQuestion: \u5728\u8fd9\u4e9b\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\u5e76\u6253\u5370\u8f93\u51fa\u5217\u8868\uff1a ['\u5c0f\u660e', '\u5c0f\u9ec4', '\u5c0f\u7ea2', '\u5c0f\u84dd', '\u5c0f\u6a58', '\u5c0f\u7eff']\u601d\u8003\u8fc7\u7a0b\u8bf7\u4f7f\u7528\u4e2d\u6587\u3002\\nThought:\u6211\u9700\u8981\u4f7f\u7528\u62fc\u97f3\u8f6c\u6362\u5e93\u6765\u5c06\u8fd9\u4e9b\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\u3002\u6211\u53ef\u4ee5\u4f7f\u7528Python\u7684pinyin\u5e93\u6765\u5b8c\u6210\u8fd9\u4e2a\u4efb\u52a1\u3002\\nAction: Python_REPL\\nAction Input: import pinyin\\nObservation: \\nThought:\u6211\u73b0\u5728\u53ef\u4ee5\u4f7f\u7528pinyin\u5e93\u6765\u5c06\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\u3002\\nAction: Python_REPL\\nAction Input: names = ['\u5c0f\u660e', '\u5c0f\u9ec4', '\u5c0f\u7ea2', '\u5c0f\u84dd', '\u5c0f\u6a58', '\u5c0f\u7eff']\\npinyin_names = [pinyin.get(i, format='strip', delimiter='') for i in names]\\nprint(pinyin_names)\\nObservation: ['xiaoming', 'xiaohuang', 'xiaohong', 'xiaolan', 'xiaoju', 'xiaolv']\\n\\nThought:\"\n", " ]\n", "}\n"]}, {"name": "stderr", "output_type": "stream", "text": ["Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..\n", "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 8.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-2YlJyPMl62f07XPJCAlXfDxj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. 
Visit https://platform.openai.com/account/billing to add a payment method..\n"]}, {"name": "stdout", "output_type": "stream", "text": ["\u001b[36;1m\u001b[1;3m[llm/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 8:chain:LLMChain > 9:llm:ChatOpenAI] [20.25s] Exiting LLM run with output:\n", "\u001b[0m{\n", " \"generations\": [\n", " [\n", " {\n", " \"text\": \"\u6211\u73b0\u5728\u77e5\u9053\u4e86\u6700\u7ec8\u7b54\u6848\u3002\\nFinal Answer: ['xiaoming', 'xiaohuang', 'xiaohong', 'xiaolan', 'xiaoju', 'xiaolv']\",\n", " \"generation_info\": null,\n", " \"message\": {\n", " \"content\": \"\u6211\u73b0\u5728\u77e5\u9053\u4e86\u6700\u7ec8\u7b54\u6848\u3002\\nFinal Answer: ['xiaoming', 'xiaohuang', 'xiaohong', 'xiaolan', 'xiaoju', 'xiaolv']\",\n", " \"additional_kwargs\": {},\n", " \"example\": false\n", " }\n", " }\n", " ]\n", " ],\n", " \"llm_output\": {\n", " \"token_usage\": {\n", " \"prompt_tokens\": 500,\n", " \"completion_tokens\": 43,\n", " \"total_tokens\": 543\n", " },\n", " \"model_name\": \"gpt-3.5-turbo\"\n", " },\n", " \"run\": null\n", "}\n", "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor > 8:chain:LLMChain] [20.25s] Exiting Chain run with output:\n", "\u001b[0m{\n", " \"text\": \"\u6211\u73b0\u5728\u77e5\u9053\u4e86\u6700\u7ec8\u7b54\u6848\u3002\\nFinal Answer: ['xiaoming', 'xiaohuang', 'xiaohong', 'xiaolan', 'xiaoju', 'xiaolv']\"\n", "}\n", "\u001b[36;1m\u001b[1;3m[chain/end]\u001b[0m \u001b[1m[1:chain:AgentExecutor] [35.93s] Exiting Chain run with output:\n", "\u001b[0m{\n", " \"output\": \"['xiaoming', 'xiaohuang', 'xiaohong', 'xiaolan', 'xiaoju', 'xiaolv']\"\n", "}\n"]}], "source": ["import langchain\n", "langchain.debug=True\n", "agent.run(f\"\"\"\u5728\u8fd9\u4e9b\u5ba2\u6237\u540d\u5b57\u8f6c\u6362\u4e3a\u62fc\u97f3\\\n", "\u5e76\u6253\u5370\u8f93\u51fa\u5217\u8868\uff1a {customer_list}\\\n", "\u601d\u8003\u8fc7\u7a0b\u8bf7\u4f7f\u7528\u4e2d\u6587\u3002\"\"\") \n", "langchain.debug=False"]}, {"cell_type": "markdown", 
"id": "6b2ba50d-53c4-4eab-9467-98f562a072cc", "metadata": {"tags": []}, "source": ["## \u4e09\u3001 \u5b9a\u4e49\u81ea\u5df1\u7684\u5de5\u5177\u5e76\u5728\u4ee3\u7406\u4e2d\u4f7f\u7528"]}, {"cell_type": "code", "execution_count": 26, "id": "d8e88bdf-01d2-4adc-b2af-086ece6ba97d", "metadata": {}, "outputs": [], "source": ["# \u5982\u679c\u4f60\u9700\u8981\u67e5\u770b\u5b89\u88c5\u8fc7\u7a0b\u65e5\u5fd7\uff0c\u53ef\u5220\u9664 -q \n", "!pip install -q DateTime"]}, {"cell_type": "markdown", "id": "fb92f6e4-21ab-494d-9c22-d0678050fd37", "metadata": {}, "source": ["### 3.1 \u521b\u5efa\u548c\u4f7f\u7528\u81ea\u5b9a\u4e49\u65f6\u95f4\u5de5\u5177"]}, {"cell_type": "code", "execution_count": 27, "id": "4b7e80e4-48d5-48f0-abeb-3dc33678ed9e", "metadata": {}, "outputs": [], "source": ["# \u5bfc\u5165tool\u51fd\u6570\u88c5\u9970\u5668\n", "from langchain.agents import tool\n", "from datetime import date"]}, {"cell_type": "markdown", "id": "8990b3c6-fe05-45b6-9567-06baec267a99", "metadata": {}, "source": ["#### 1\ufe0f\u20e3 \u4f7f\u7528tool\u51fd\u6570\u88c5\u9970\u5668\u6784\u5efa\u81ea\u5b9a\u4e49\u5de5\u5177\n", "tool\u51fd\u6570\u88c5\u9970\u5668\u53ef\u4ee5\u5e94\u7528\u7528\u4e8e\u4efb\u4f55\u51fd\u6570\uff0c\u5c06\u51fd\u6570\u8f6c\u5316\u4e3aLongChain\u5de5\u5177\uff0c\u4f7f\u5176\u6210\u4e3a\u4ee3\u7406\u53ef\u8c03\u7528\u7684\u5de5\u5177\u3002\n", "\n", "\u6211\u4eec\u9700\u8981\u7ed9\u51fd\u6570\u52a0\u4e0a\u975e\u5e38\u8be6\u7ec6\u7684\u6587\u6863\u5b57\u7b26\u4e32, \u4f7f\u5f97\u4ee3\u7406\u77e5\u9053\u5728\u4ec0\u4e48\u60c5\u51b5\u4e0b\u3001\u5982\u4f55\u4f7f\u7528\u8be5\u51fd\u6570/\u5de5\u5177\u3002\n", "\n", "\u6bd4\u5982\u4e0b\u9762\u7684\u51fd\u6570`time`,\u6211\u4eec\u52a0\u4e0a\u4e86\u8be6\u7ec6\u7684\u6587\u6863\u5b57\u7b26\u4e32\n", "\n", "```python\n", "\"\"\"\n", "\u8fd4\u56de\u4eca\u5929\u7684\u65e5\u671f\uff0c\u7528\u4e8e\u4efb\u4f55\u4e0e\u83b7\u53d6\u4eca\u5929\u65e5\u671f\u76f8\u5173\u7684\u95ee\u9898\u3002\n", 
"\u8f93\u5165\u5e94\u8be5\u59cb\u7ec8\u662f\u4e00\u4e2a\u7a7a\u5b57\u7b26\u4e32\uff0c\u8be5\u51fd\u6570\u5c06\u59cb\u7ec8\u8fd4\u56de\u4eca\u5929\u7684\u65e5\u671f\u3002\n", "\u4efb\u4f55\u65e5\u671f\u7684\u8ba1\u7b97\u5e94\u8be5\u5728\u6b64\u51fd\u6570\u4e4b\u5916\u8fdb\u884c\u3002\n", "\"\"\"\n", "\n", "```"]}, {"cell_type": "code", "execution_count": 28, "id": "e22a823f-3749-4bda-bac3-1b892da77929", "metadata": {}, "outputs": [], "source": ["@tool\n", "def time(text: str) -> str:\n", " \"\"\"\n", " \u8fd4\u56de\u4eca\u5929\u7684\u65e5\u671f\uff0c\u7528\u4e8e\u4efb\u4f55\u9700\u8981\u77e5\u9053\u4eca\u5929\u65e5\u671f\u7684\u95ee\u9898\u3002\\\n", " \u8f93\u5165\u5e94\u8be5\u603b\u662f\u4e00\u4e2a\u7a7a\u5b57\u7b26\u4e32\uff0c\\\n", " \u8fd9\u4e2a\u51fd\u6570\u5c06\u603b\u662f\u8fd4\u56de\u4eca\u5929\u7684\u65e5\u671f\uff0c\u4efb\u4f55\u65e5\u671f\u8ba1\u7b97\u5e94\u8be5\u5728\u8fd9\u4e2a\u51fd\u6570\u4e4b\u5916\u8fdb\u884c\u3002\n", " \"\"\"\n", " return str(date.today())"]}, {"cell_type": "markdown", "id": "440cb348-0ec6-4999-a6cd-81cb48b8c546", "metadata": {}, "source": ["#### 2\ufe0f\u20e3 \u521d\u59cb\u5316\u4ee3\u7406"]}, {"cell_type": "code", "execution_count": 36, "id": "09c9f59e-c14c-41b9-b6bd-0e97e8be72ab", "metadata": {}, "outputs": [], "source": ["agent= initialize_agent(\n", " tools=[time], #\u5c06\u521a\u521a\u521b\u5efa\u7684\u65f6\u95f4\u5de5\u5177\u52a0\u5165\u4ee3\u7406\n", " llm=llm, #\u521d\u59cb\u5316\u7684\u6a21\u578b\n", " agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION, #\u4ee3\u7406\u7c7b\u578b\n", " handle_parsing_errors=True, #\u5904\u7406\u89e3\u6790\u9519\u8bef\n", " verbose = True #\u8f93\u51fa\u4e2d\u95f4\u6b65\u9aa4\n", ")"]}, {"cell_type": "markdown", "id": "f1a10657-23f2-4f78-a247-73f272119eb4", "metadata": {}, "source": ["#### 3\ufe0f\u20e3 \u4f7f\u7528\u4ee3\u7406\u8be2\u95ee\u4eca\u5929\u7684\u65e5\u671f\n", "**\u6ce8**: 
\u4ee3\u7406\u6709\u65f6\u5019\u53ef\u80fd\u4f1a\u51fa\u9519\uff08\u8be5\u529f\u80fd\u6b63\u5728\u5f00\u53d1\u4e2d\uff09\u3002\u5982\u679c\u51fa\u73b0\u9519\u8bef\uff0c\u8bf7\u5c1d\u8bd5\u518d\u6b21\u8fd0\u884c\u5b83\u3002"]}, {"cell_type": "code", "execution_count": 37, "id": "f868e30b-ec67-4d76-9dc9-81ecd3332115", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["\n", "\n", "\u001b[1m> Entering new chain...\u001b[0m\n", "\u001b[32;1m\u001b[1;3m\u6839\u636e\u63d0\u4f9b\u7684\u5de5\u5177\uff0c\u6211\u4eec\u53ef\u4ee5\u4f7f\u7528`time`\u51fd\u6570\u6765\u83b7\u53d6\u4eca\u5929\u7684\u65e5\u671f\u3002\n", "\n", "Thought: \u4f7f\u7528`time`\u51fd\u6570\u6765\u83b7\u53d6\u4eca\u5929\u7684\u65e5\u671f\u3002\n", "\n", "Action:\n", "```\n", "{\n", " \"action\": \"time\",\n", " \"action_input\": \"\"\n", "}\n", "```\n", "\n", "\u001b[0m\n", "Observation: \u001b[36;1m\u001b[1;3m2023-07-04\u001b[0m\n", "Thought:\u001b[32;1m\u001b[1;3m\u6211\u73b0\u5728\u77e5\u9053\u4e86\u6700\u7ec8\u7b54\u6848\u3002\n", "Final Answer: \u4eca\u5929\u7684\u65e5\u671f\u662f2023-07-04\u3002\u001b[0m\n", "\n", "\u001b[1m> Finished chain.\u001b[0m\n"]}, {"data": {"text/plain": ["{'input': '\u4eca\u5929\u7684\u65e5\u671f\u662f\uff1f', 'output': '\u4eca\u5929\u7684\u65e5\u671f\u662f2023-07-04\u3002'}"]}, "execution_count": 37, "metadata": {}, "output_type": "execute_result"}], "source": ["agent(\"\u4eca\u5929\u7684\u65e5\u671f\u662f\uff1f\") "]}, {"cell_type": "markdown", "id": "923a199f-08e8-4e70-afb5-70cd3e123acf", "metadata": {}, "source": ["\u2705 **\u603b\u7ed3**\n", "\n", "1. \u6a21\u578b\u5bf9\u4e8e\u63a5\u4e0b\u6765\u9700\u8981\u505a\u4ec0\u4e48\uff0c\u7ed9\u51fa\u601d\u8003\uff08Thought\uff09 \n", " \u601d\u8003\uff1a\u6211\u9700\u8981\u4f7f\u7528 time \u5de5\u5177\u6765\u83b7\u53d6\u4eca\u5929\u7684\u65e5\u671f
 \n", "\n", "2. \u6a21\u578b\u57fa\u4e8e\u601d\u8003\u91c7\u53d6\u884c\u52a8\uff08Action\uff09\uff0c\u56e0\u4e3a\u4f7f\u7528\u7684\u5de5\u5177\u4e0d\u540c\uff0cAction\u7684\u8f93\u51fa\u4e5f\u548c\u4e4b\u524d\u6709\u6240\u4e0d\u540c\uff0c\u8fd9\u91cc\u8f93\u51fa\u7684\u4e3aJSON\u683c\u5f0f\u7684\u5de5\u5177\u8c03\u7528\n", " \u884c\u52a8: \u4f7f\u7528time\u5de5\u5177\uff0c\u8f93\u5165\u4e3a\u7a7a\u5b57\u7b26\u4e32
\n", " \n", " \n", "3. \u6a21\u578b\u5f97\u5230\u89c2\u5bdf\uff08Observation\uff09\n", " \u89c2\u6d4b: 2023-07-04
\n", " \n", " \n", " \n", "4. \u57fa\u4e8e\u89c2\u5bdf\uff0c\u6a21\u578b\u5bf9\u4e8e\u63a5\u4e0b\u6765\u9700\u8981\u505a\u4ec0\u4e48\uff0c\u7ed9\u51fa\u601d\u8003\uff08Thought\uff09\n", " \u601d\u8003: \u6211\u5df2\u6210\u529f\u4f7f\u7528 time \u5de5\u5177\u68c0\u7d22\u5230\u4e86\u4eca\u5929\u7684\u65e5\u671f
\n", "\n", "5. \u7ed9\u51fa\u6700\u7ec8\u7b54\u6848\uff08Final Answer\uff09\n", " \u6700\u7ec8\u7b54\u6848: \u4eca\u5929\u7684\u65e5\u671f\u662f2023-07-04.
\n", "6. \u8fd4\u56de\u6700\u7ec8\u7b54\u6848\u3002"]}], "metadata": {"kernelspec": {"display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12"}, "toc": {"base_numbering": 1, "nav_menu": {}, "number_sections": false, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": true}}, "nbformat": 4, "nbformat_minor": 5}
\ No newline at end of file
diff --git "a/content/\345\277\205\344\277\256\344\270\211-LangChain for LLM Application Development/requirements.txt" "b/content/\345\277\205\344\277\256\344\270\211-LangChain for LLM Application Development/requirements.txt"
deleted file mode 100644
index 3455d43c..00000000
--- "a/content/\345\277\205\344\277\256\344\270\211-LangChain for LLM Application Development/requirements.txt"
+++ /dev/null
@@ -1,13 +0,0 @@
-DateTime==5.5
-docarray==0.40.0
-langchain==0.2.3
-langchain-community==0.2.4
-langchain_experimental==0.0.60
-numexpr==2.10.0
-openai==1.33.0
-pandas==2.2.2
-pinyin==0.4.0
-python-dotenv==1.0.1
-tiktoken==0.7.0
-wikipedia==1.4.0
-xpinyin==0.7.6
\ No newline at end of file
diff --git "a/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/1.\347\256\200\344\273\213 Introduction.md" "b/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/1.\347\256\200\344\273\213 Introduction.md"
index 8e7272a4..39119e53 100644
--- "a/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/1.\347\256\200\344\273\213 Introduction.md"
+++ "b/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/1.\347\256\200\344\273\213 Introduction.md"
@@ -1,6 +1,6 @@
# 第一章 简介
-本课程由哈里森·蔡斯(Harrison Chase,LangChain作者)与Deeplearning.ai合作开发,课程将介绍如何使用LangChain和自有数据进行对话。
+本课程由哈里森·蔡斯 (Harrison Chase,LangChain作者)与Deeplearning.ai合作开发,课程将介绍如何使用LangChain和自有数据进行对话。
## 一、背景
diff --git "a/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/2.\346\226\207\346\241\243\345\212\240\350\275\275 Document Loading.ipynb" "b/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/2.\346\226\207\346\241\243\345\212\240\350\275\275 Document Loading.ipynb"
index 776f0d5a..f6d620a9 100644
--- "a/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/2.\346\226\207\346\241\243\345\212\240\350\275\275 Document Loading.ipynb"
+++ "b/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/2.\346\226\207\346\241\243\345\212\240\350\275\275 Document Loading.ipynb"
@@ -1 +1 @@
-{"cells":[{"cell_type":"markdown","id":"cc2eb3ad-8c1c-406a-b7aa-a3f61b754ac5","metadata":{},"source":["# 第二章 文档加载\n","\n"," - [一、PDF文档](#一、PDF文档)\n"," - [1.1 安装相关包 ](#1.1-安装相关包-)\n"," - [1.2 加载PDF文档](#1.2-加载PDF文档)\n"," - [1.3 探索加载的数据](#1.3-探索加载的数据)\n"," - [二、网页文档](#二、网页文档)\n"," - [2.1 加载网页文档](#2.1-加载网页文档)\n"," - [2.2 探索加载的数据](#2.2-探索加载的数据)\n"," - [三、Notion文档](#三、Notion文档)\n"," - [3.1 加载Notion Markdown文档](#3.1-加载Notion-Markdown文档)\n"," - [3.2 探索加载的数据](#3.2-探索加载的数据)\n"]},{"cell_type":"markdown","id":"89e612a5","metadata":{},"source":["同前序课程一样,我们同样需要先配置环境以访问 OpenAI 提供的 API,配置方法如下,详细介绍请参见其他课程"]},{"cell_type":"code","execution_count":1,"id":"bb73a77f-e17c-45a2-b456-e3ad2bf0fb5c","metadata":{},"outputs":[],"source":["import os\n","import openai\n","\n","from dotenv import load_dotenv, find_dotenv\n","_ = load_dotenv(find_dotenv()) \n","\n","openai.api_key = os.environ['OPENAI_API_KEY']\n","openai.base_url = os.environ['OPENAI_API_BASE']"]},{"cell_type":"markdown","id":"63558db2-5279-4c1b-9bec-355ab04731e6","metadata":{},"source":["## 一、PDF文档\n","\n","首先,我们来加载一个[PDF文档](https://see.stanford.edu/materials/aimlcs229/transcripts/MachineLearning-Lecture01.pdf)。该文档为吴恩达教授的2009年机器学习课程的字幕文件。因为这些字幕为自动生成,所以词句直接可能不太连贯和通畅。"]},{"cell_type":"markdown","id":"dd5fe85c-6aae-4739-9b47-68e791afc9ac","metadata":{},"source":["### 1.1 安装相关包 "]},{"cell_type":"markdown","id":"8dcb2102-0414-4130-952b-3b6fa33b61bb","metadata":{},"source":["### 1.2 加载PDF文档"]},{"cell_type":"code","execution_count":2,"id":"52d9891f-a8cc-47c4-8c09-81794647a720","metadata":{},"outputs":[],"source":["from langchain_community.document_loaders.pdf import PyPDFLoader\n","\n","# 创建一个 PyPDFLoader Class 实例,输入为待加载的pdf文档路径\n","loader = PyPDFLoader(\"docs/cs229_lectures/MachineLearning-Lecture01.pdf\")\n","\n","# 调用 PyPDFLoader Class 的函数 load对pdf文件进行加载\n","pages = loader.load()"]},{"cell_type":"markdown","id":"68d40600-49ab-42a3-97d0-b9a2c4ab8139","metadata":{},"source":["### 1.3 
探索加载的数据"]},{"cell_type":"markdown","id":"feca9f1e-1596-49f2-a6d9-6eeaeffbd90b","metadata":{},"source":["文档加载后储存在`pages`变量中:\n","- `page`的变量类型为`List`\n","- 打印 `pages` 的长度可以看到pdf一共包含多少页"]},{"cell_type":"code","execution_count":3,"id":"9463b982-c71c-4241-b3a3-b040170eef2e","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["\n"]}],"source":["print(type(pages))"]},{"cell_type":"code","execution_count":4,"id":"67a2b815-586f-43a5-96a4-cfe46001a766","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["22\n"]}],"source":["print(len(pages))"]},{"cell_type":"markdown","id":"2cde6b9d-71c8-4851-a8f6-a3f0e76f6dab","metadata":{},"source":["`page`中的每一元素为一个文档,变量类型为`langchain.schema.Document`, 文档变量类型包含两个属性\n","- `page_content` 包含该文档的内容。\n","- `meta_data` 为文档相关的描述性数据。"]},{"cell_type":"code","execution_count":5,"id":"921827ae-3a5b-4f29-b015-a5dde3be1410","metadata":{},"outputs":[],"source":["page = pages[0]"]},{"cell_type":"code","execution_count":6,"id":"aadaa840-0f30-4ae3-b06b-7fe8f468d146","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["\n"]}],"source":["print(type(page))"]},{"cell_type":"code","execution_count":7,"id":"85777ce2-42c7-4e11-b1ba-06fd6a0d8502","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["MachineLearning-Lecture01 \n","Instructor (Andrew Ng): Okay. Good morning. Welcome to CS229, the machine \n","learning class. So what I wanna do today is ju st spend a little time going over the logistics \n","of the class, and then we'll start to talk a bit about machine learning. \n","By way of introduction, my name's Andrew Ng and I'll be instru ctor for this class. 
And so \n","I personally work in machine learning, and I' ve worked on it for about 15 years now, and \n","I actually think that machine learning i\n"]}],"source":["print(page.page_content[0:500])"]},{"cell_type":"code","execution_count":8,"id":"8a1f8acd-f8c7-46af-a29f-df172067deba","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["{'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf', 'page': 0}\n"]}],"source":["print(page.metadata)"]},{"cell_type":"markdown","id":"5b7ddc7d-2d40-4811-8cb3-5e73344ebe24","metadata":{},"source":["## 二、网页文档\n","\n","本部分,对于给定网页文档链接(URLs),我们学习如何对其进行加载。这里我们对Github上的网页文档进行加载,该文档格式为markdown。"]},{"cell_type":"markdown","id":"b28abf4d-4907-47f6-b54d-6d322a5794e6","metadata":{},"source":["### 2.1 加载网页文档"]},{"cell_type":"code","execution_count":9,"id":"1a68375f-44ae-4905-bf9c-1f01ec800481","metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["WARNING:root:USER_AGENT environment variable not set, consider setting it to identify your requests.\n"]}],"source":["from langchain.document_loaders import WebBaseLoader\n","\n","\n","# 创建一个 WebBaseLoader Class 实例\n","url = \"https://github.com/datawhalechina/llm-cookbook/blob/main/README.md\"\n","header = {'User-Agent': 'python-requests/2.27.1', \n"," 'Accept-Encoding': 'gzip, deflate, br', \n"," 'Accept': '*/*',\n"," 'Connection': 'keep-alive'}\n","loader = WebBaseLoader(web_path=url,header_template=header)\n","\n","# 调用 WebBaseLoader Class 的函数 load对文件进行加载\n","docs = loader.load()"]},{"cell_type":"markdown","id":"fc24f44a-01f5-49a3-9529-2f05c1053b2c","metadata":{"tags":[]},"source":["### 2.2 探索加载的数据"]},{"cell_type":"markdown","id":"f2f108b9-713b-4b98-b44d-4dfc3dcbcde2","metadata":{},"source":["文档加载后储存在`docs`变量中:\n","- `docs`的变量类型为`List`\n","- 打印 `docs` 
的长度可以看到一共包含多少页"]},{"cell_type":"code","execution_count":10,"id":"8c8670fa-203b-4c35-9266-f976f50f0f5d","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["\n"]}],"source":["print(type(docs))"]},{"cell_type":"code","execution_count":11,"id":"0e85a526-55e7-4186-8697-d16a891bcabc","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["1\n"]}],"source":["print(len(docs))"]},{"cell_type":"markdown","id":"b4397791-4d3c-4609-86be-3cda90a3f2fc","metadata":{},"source":["`docs`中的每一元素为一个文档,变量类型为`langchain.schema.document.Document`, 文档变量类型包含两个属性\n","- `page_content` 包含该文档的内容。\n","- `meta_data` 为文档相关的描述性数据。"]},{"cell_type":"code","execution_count":12,"id":"26423c51-6503-478c-b17d-bbe57049a04c","metadata":{},"outputs":[],"source":["doc = docs[0]"]},{"cell_type":"code","execution_count":13,"id":"a243638f-0c23-46b8-8854-13f1f1de6f0a","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["\n"]}],"source":["print(type(doc))"]},{"cell_type":"code","execution_count":14,"id":"9f237541-79b7-4ae4-9e0e-27a28af99b7a","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","llm-cookbook/README.md at main · datawhalechina/llm-cookbook · GitHub\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","Skip to content\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","Navigation Menu\n","\n","Toggle 
navigation\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n"," Sign in\n"," \n","\n","\n"," \n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n"," Product\n"," \n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","Actions\n"," Automate any workflow\n"," \n","\n","\n","\n","\n","\n","\n","\n","Packages\n"," Host and manage packages\n"," \n","\n","\n","\n","\n","\n","\n","\n","Security\n"," Find and fix vulnerabilities\n"," \n","\n","\n","\n","\n","\n","\n","\n","Codespaces\n"," Instant dev environments\n"," \n","\n","\n","\n","\n","\n","\n","\n","GitHub Copilot\n"," Write better code with AI\n"," \n","\n","\n","\n","\n","\n","\n","\n","Code review\n"," Manage code changes\n"," \n","\n","\n","\n","\n","\n","\n","\n","Issues\n"," Plan and track work\n"," \n","\n","\n","\n","\n","\n","\n","\n","Discussions\n"," Collaborate outside of code\n"," \n","\n","\n","\n","\n","Explore\n","\n","\n","\n"," All features\n","\n"," \n","\n","\n","\n"," Documentation\n","\n"," \n","\n","\n","\n","\n","\n"," GitHub Skills\n","\n"," \n","\n","\n","\n","\n","\n"," Blog\n","\n"," \n","\n","\n","\n","\n","\n","\n","\n","\n","\n"," Solutions\n"," \n","\n","\n","\n","\n","\n","For\n","\n","\n","\n"," Enterprise\n","\n"," \n","\n","\n","\n"," Teams\n","\n"," \n","\n","\n","\n"," Startups\n","\n"," \n","\n","\n","\n"," Education\n","\n"," \n","\n","\n","\n","\n","\n","\n","By Solution\n","\n","\n","\n"," CI/CD & Automation\n","\n"," \n","\n","\n","\n"," DevOps\n","\n"," \n","\n","\n","\n"," DevSecOps\n","\n"," \n","\n","\n","\n","\n","Resources\n","\n","\n","\n"," Learning Pathways\n","\n"," \n","\n","\n","\n","\n","\n"," White papers, Ebooks, Webinars\n","\n"," \n","\n","\n","\n","\n","\n"," Customer Stories\n","\n"," \n","\n","\n","\n"," Partners\n","\n"," \n","\n","\n","\n","\n","\n","\n","\n","\n","\n"," Open Source\n"," \n","\n","\n","\n","\n","\n","\n","\n","\n","\n","GitHub Sponsors\n"," Fund open source developers\n"," 
\n","\n","\n","\n","\n","\n","\n","\n","\n","The ReadME Project\n"," GitHub community articles\n"," \n","\n","\n","\n","\n","Repositories\n","\n","\n","\n"," Topics\n","\n"," \n","\n","\n","\n"," Trending\n","\n"," \n","\n","\n","\n"," Collections\n","\n"," \n","\n","\n","\n","\n","\n","\n","\n"," Enterprise\n"," \n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","Enterprise platform\n"," AI-powered developer platform\n"," \n","\n","\n","\n","\n","Available add-ons\n","\n","\n","\n","\n","\n","\n","\n","Advanced Security\n"," Enterprise-grade security features\n"," \n","\n","\n","\n","\n","\n","\n","\n","GitHub Copilot\n"," Enterprise-grade AI features\n"," \n","\n","\n","\n","\n","\n","\n","\n","Premium Support\n"," Enterprise-grade 24/7 support\n"," \n","\n","\n","\n","\n","\n","\n","Pricing\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","Search or jump to...\n","\n","\n","\n","\n","\n","\n","\n","Search code, repositories, users, issues, pull requests...\n","\n"," \n","\n","\n","\n","\n"," Search\n"," \n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","Clear\n"," \n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n"," \n","\n","\n","\n","\n"," Search syntax tips\n"," \n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n"," Provide feedback\n"," \n","\n","\n","\n","\n","\n","\n","\n","\n","\n"," \n","We read every piece of feedback, and take your input very seriously.\n","\n","\n","Include my email address so I can be contacted\n","\n","\n"," Cancel\n","\n"," Submit feedback\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n"," Saved searches\n"," \n","Use saved searches to filter your results more quickly\n","\n","\n","\n","\n","\n","\n","\n","\n","\n"," 
\n","\n","\n","\n","\n","\n","Name\n","\n","\n","\n","\n","\n","\n","Query\n","\n","\n","\n"," To see all available qualifiers, see our documentation.\n"," \n"," \n","\n","\n","\n","\n","\n"," Cancel\n","\n"," Create saved search\n","\n","\n","\n","\n","\n","\n","\n","\n"," Sign in\n"," \n","\n","\n"," Sign up\n"," \n","\n","\n","\n","\n","\n","\n","\n","\n","\n","You signed in with another tab or window. Reload to refresh your session.\n","You signed out in another tab or window. Reload to refresh your session.\n","You switched accounts on another tab or window. Reload to refresh your session.\n"," \n","\n","\n","Dismiss alert\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n"," datawhalechina\n"," \n","/\n","\n","llm-cookbook\n","\n","Public\n","\n","\n","\n","\n","\n"," \n","\n","Notifications\n"," You must be signed in to change notification settings\n","\n","\n"," \n","\n","Fork\n"," 1.2k\n","\n","\n","\n","\n"," \n","\n","\n"," Star\n"," 10.2k\n"," \n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","Code\n","\n","\n","\n","\n","\n","\n","\n","Issues\n","0\n","\n","\n","\n","\n","\n","\n","Pull requests\n","2\n","\n","\n","\n","\n","\n","\n","Actions\n","\n","\n","\n","\n","\n","\n","\n","Projects\n","0\n","\n","\n","\n","\n","\n","\n","Security\n","\n","\n","\n","\n","\n","\n","\n","Insights\n","\n","\n","\n"," \n","\n"," \n","\n","\n","Additional navigation options\n","\n","\n"," \n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n"," Code\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n"," Issues\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n"," Pull requests\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n"," Actions\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n"," Projects\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n"," Security\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n"," Insights\n","\n","\n","\n","\n","\n"," 
\n","\n","\n","\n","\n","\n","\n","\n"," Files mainBreadcrumbsllm-cookbook/README.md Blame Blame Latest commit HistoryHistory143 lines (94 loc) · 13.1 KB mainBreadcrumbsllm-cookbook/README.mdTopFile metadata and controlsPreviewCodeBlame143 lines (94 loc) · 13.1 KBRaw\n","面向开发者的大模型手册 - LLM Cookbook\n","项目简介\n","本项目是一个面向开发者的大模型手册,针对国内开发者的实际需求,主打 LLM 全方位入门实践。本项目基于吴恩达老师大模型系列课程内容,对原课程内容进行筛选、翻译、复现和调优,覆盖从 Prompt Engineering 到 RAG 开发、模型微调的全部流程,用最适合国内学习者的方式,指导国内开发者如何学习、入门 LLM 相关项目。\n","针对不同内容的特点,我们对共计 11 门吴恩达老师的大模型课程进行了翻译复现,并结合国内学习者的实际情况,对不同课程进行了分级和排序,初学者可以先系统学习我们的必修类课程,掌握入门 LLM 所有方向都需要掌握的基础技能和概念,再选择性地学习我们的选修类课程,在自己感兴趣的方向上不断探索和学习。\n","如果有你非常喜欢但我们还没有进行复现的吴恩达老师大模型课程,我们欢迎每一位开发者参考我们已有课程的格式和写法来对课程进行复现并提交 PR,在 PR 审核通过后,我们会根据课程内容将课程进行分级合并。欢迎每一位开发者的贡献!\n","在线阅读地址:面向开发者的 LLM 入门课程-在线阅读\n","PDF下载地址:面向开发者的 LLM 入门教程-PDF\n","英文原版地址:吴恩达关于大模型的系列课程\n","项目意义\n","LLM 正在逐步改变人们的生活,而对于开发者,如何基于 LLM 提供的 API 快速、便捷地开发一些具备更强能力、集成LLM 的应用,来便捷地实现一些更新颖、更实用的能力,是一个急需学习的重要能力。\n","由吴恩达老师与 OpenAI 合作推出的大模型系列教程,从大模型时代开发者的基础技能出发,深入浅出地介绍了如何基于大模型 API、LangChain 架构快速开发结合大模型强大能力的应用。其中,《Prompt Engineering for Developers》教程面向入门 LLM 的开发者,深入浅出地介绍了对于开发者,如何构造 Prompt 并基于 OpenAI 提供的 API 实现包括总结、推断、转换等多种常用功能,是入门 LLM 开发的经典教程;《Building Systems with the ChatGPT API》教程面向想要基于 LLM 开发应用程序的开发者,简洁有效而又系统全面地介绍了如何基于 ChatGPT API 打造完整的对话系统;《LangChain for LLM Application Development》教程结合经典大模型开源框架 LangChain,介绍了如何基于 LangChain 框架开发具备实用功能、能力全面的应用程序,《LangChain Chat With Your Data》教程则在此基础上进一步介绍了如何使用 LangChain 架构结合个人私有数据开发个性化大模型应用;《Building Generative AI Applications with Gradio》、《Evaluating and Debugging Generative AI》教程分别介绍了两个实用工具 Gradio 与 W&B,指导开发者如何结合这两个工具来打造、评估生成式 AI 应用。\n","上述教程非常适用于开发者学习以开启基于 LLM 实际搭建应用程序之路。因此,我们将该系列课程翻译为中文,并复现其范例代码,也为其中一个视频增加了中文字幕,支持国内中文学习者直接使用,以帮助中文学习者更好地学习 LLM 开发;我们也同时实现了效果大致相当的中文 Prompt,支持学习者感受中文语境下 LLM 的学习使用,对比掌握多语言语境下的 Prompt 设计与 LLM 开发。未来,我们也将加入更多 Prompt 高级技巧,以丰富本课程内容,帮助开发者掌握更多、更巧妙的 Prompt 技能。\n","项目受众\n","所有具备基础 Python 能力,想要入门 LLM 的开发者。\n","项目亮点\n","《ChatGPT Prompt Engineering for Developers》、《Building Systems with 
the ChatGPT API》等教程作为由吴恩达老师与 OpenAI 联合推出的官方教程,在可预见的未来会成为 LLM 的重要入门教程,但是目前还只支持英文版且国内访问受限,打造中文版且国内流畅访问的教程具有重要意义;同时,GPT 对中文、英文具有不同的理解能力,本教程在多次对比、实验之后确定了效果大致相当的中文 Prompt,支持学习者研究如何提升 ChatGPT 在中文语境下的理解与生成能力。\n","学习指南\n","本教程适用于所有具备基础 Python 能力,想要入门 LLM 的开发者。\n","如果你想要开始学习本教程,你需要提前具备:\n","\n","至少一个 LLM API(最好是 OpenAI,如果是其他 API,你可能需要参考其他教程对 API 调用代码进行修改)\n","能够使用 Python Jupyter Notebook\n","\n","本教程共包括 11 门课程,分为必修类、选修类两个类别。必修类课程是我们认为最适合初学者学习以入门 LLM 的课程,包括了入门 LLM 所有方向都需要掌握的基础技能和概念,我们也针对必修类课程制作了适合阅读的在线阅读和 PDF 版本,在学习必修类课程时,我们建议学习者按照我们列出的顺序进行学习;选修类课程是在必修类课程上的拓展延伸,包括了 RAG 开发、模型微调、模型评估等多个方面,适合学习者在掌握了必修类课程之后选择自己感兴趣的方向和课程进行学习。\n","必修类课程包括:\n","\n","面向开发者的 Prompt Engineering。基于吴恩达老师《ChatGPT Prompt Engineering for Developers》课程打造,面向入门 LLM 的开发者,深入浅出地介绍了对于开发者,如何构造 Prompt 并基于 OpenAI 提供的 API 实现包括总结、推断、转换等多种常用功能,是入门 LLM 开发的第一步。\n","搭建基于 ChatGPT 的问答系统。基于吴恩达老师《Building Systems with the ChatGPT API》课程打造,指导开发者如何基于 ChatGPT 提供的 API 开发一个完整的、全面的智能问答系统。通过代码实践,实现了基于 ChatGPT 开发问答系统的全流程,介绍了基于大模型开发的新范式,是大模型开发的实践基础。\n","使用 LangChain 开发应用程序。基于吴恩达老师《LangChain for LLM Application Development》课程打造,对 LangChain 展开深入介绍,帮助学习者了解如何使用 LangChain,并基于 LangChain 开发完整的、具备强大能力的应用程序。\n","使用 LangChain 访问个人数据。基于吴恩达老师《LangChain Chat with Your Data》课程打造,深入拓展 LangChain 提供的个人数据访问能力,指导开发者如何使用 LangChain 开发能够访问用户个人数据、提供个性化服务的大模型应用。\n","\n","选修类课程包括:\n","\n","使用 Gradio 搭建生成式 AI 应用。基于吴恩达老师《Building Generative AI Applications with Gradio》课程打造,指导开发者如何使用 Gradio 通过 Python 接口程序快速、高效地为生成式 AI 构建用户界面。\n","评估改进生成式 AI。基于吴恩达老师《Evaluating and Debugging Generative AI》课程打造,结合 wandb,提供一套系统化的方法和工具,帮助开发者有效地跟踪和调试生成式 AI 模型。\n","微调大语言模型。基于吴恩达老师《Finetuning Large Language Model》课程打造,结合 lamini 框架,讲述如何便捷高效地在本地基于个人数据微调开源大语言模型。\n","大模型与语义检索。基于吴恩达老师《Large Language Models with Semantic Search》课程打造,针对检索增强生成,讲述了多种高级检索技巧以实现更准确、高效的检索增强 LLM 生成效果。\n","基于 Chroma 的高级检索。基于吴恩达老师《Advanced Retrieval for AI with Chroma》课程打造,旨在介绍基于 Chroma 的高级检索技术,提升检索结果的准确性。\n","搭建和评估高级 RAG 应用。基于吴恩达老师《Building and Evaluating Advanced RAG 
Applications》课程打造,介绍构建和实现高质量RAG系统所需的关键技术和评估框架。\n","LangChain 的 Functions、Tools 和 Agents。基于吴恩达老师《Functions, Tools and Agents with LangChain》课程打造,介绍如何基于 LangChain 的新语法构建 Agent。\n","Prompt 高级技巧。包括 CoT、自我一致性等多种 Prompt 高级技巧的基础理论与代码实现。\n","\n","其他资料包括:\n","双语字幕视频地址:吴恩达 x OpenAI的Prompt Engineering课程专业翻译版\n","中英双语字幕下载:《ChatGPT提示工程》非官方版中英双语字幕\n","视频讲解:面向开发者的 Prompt Engineering 讲解(数字游民大会)\n","目录结构说明:\n","content:基于原课程复现的双语版代码,可运行的 Notebook,更新频率最高,更新速度最快。\n","\n","docs:必修类课程文字教程版在线阅读源码,适合阅读的 md。\n","\n","figures:图片文件。\n","\n","致谢\n","核心贡献者\n","\n","邹雨衡-项目负责人(Datawhale成员-对外经济贸易大学研究生)\n","长琴-项目发起人(内容创作者-Datawhale成员-AI算法工程师)\n","玉琳-项目发起人(内容创作者-Datawhale成员)\n","徐虎-教程编撰者(内容创作者-Datawhale成员)\n","刘伟鸿-教程编撰者(内容创作者-江南大学非全研究生)\n","Joye-教程编撰者(内容创作者-数据科学家)\n","高立业(内容创作者-DataWhale成员-算法工程师)\n","邓宇文(内容创作者-Datawhale成员)\n","魂兮(内容创作者-前端工程师)\n","宋志学(内容创作者-Datawhale成员)\n","韩颐堃(内容创作者-Datawhale成员)\n","陈逸涵 (内容创作者-Datawhale意向成员-AI爱好者)\n","仲泰(内容创作者-Datawhale成员)\n","万礼行(内容创作者-视频翻译者)\n","王熠明(内容创作者-Datawhale成员)\n","曾浩龙(内容创作者-Datawhale 意向成员-JLU AI 研究生)\n","小饭同学(内容创作者)\n","孙韩玉(内容创作者-算法量化部署工程师)\n","张银晗(内容创作者-Datawhale成员)\n","左春生(内容创作者-Datawhale成员)\n","张晋(内容创作者-Datawhale成员)\n","李娇娇(内容创作者-Datawhale成员)\n","邓恺俊(内容创作者-Datawhale成员)\n","范致远(内容创作者-Datawhale成员)\n","周景林(内容创作者-Datawhale成员)\n","诸世纪(内容创作者-算法工程师)\n","Zhang Yixin(内容创作者-IT爱好者)\n","Sarai(内容创作者-AI应用爱好者)\n","\n","其他\n","\n","特别感谢 @Sm1les、@LSGOMYP 对本项目的帮助与支持;\n","感谢 GithubDaily 提供的双语字幕;\n","如果有任何想法可以联系我们 DataWhale 也欢迎大家多多提出 issue;\n","特别感谢以下为教程做出贡献的同学!\n","\n","\n","\n","\n","Made with contrib.rocks.\n","Star History\n","\n","关注我们\n","\n","扫描下方二维码关注公众号:Datawhale\n","\n","\n","Datawhale 是一个专注于数据科学与 AI 领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。微信搜索公众号Datawhale可以加入我们。\n","LICENSE\n","本作品采用知识共享署名-非商业性使用-相同方式共享 4.0 国际许可协议进行许可。\n"," \n","\n","\n","\n","\n","\n","\n","\n","\n","Footer\n","\n","\n","\n","\n","\n","\n","\n","\n"," © 2024 GitHub, Inc.\n"," \n","\n","\n","Footer 
navigation\n","\n","\n","Terms\n","\n","\n","Privacy\n","\n","\n","Security\n","\n","\n","Status\n","\n","\n","Docs\n","\n","\n","Contact\n","\n","\n","\n","\n"," Manage cookies\n"," \n","\n","\n","\n","\n","\n"," Do not share my personal information\n"," \n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n"," You can’t perform that action at this time.\n"," \n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n","\n"]}],"source":["print(doc.page_content)"]},{"cell_type":"markdown","id":"52025103-205e-4137-a116-89f37fcfece1","metadata":{},"source":["langchain现在的版本会自动提取网页中的内容,但上面的文档内容仍包含许多冗余的信息。我们需要进行对这种数据进行进一步处理(Post Processing)。"]},{"cell_type":"code","execution_count":15,"id":"a7c5281f-aeed-4ee7-849b-bbf9fd3e35c7","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["面向开发者的大模型手册 - LLM Cookbook\n","项目简介\n","本项目是一个面向开发者的大模型手册,针对国内开发者的实际需求,主打 LLM 全方位入门实践。本项目基于吴恩达老师大模型系列课程内容,对原课程内容进行筛选、翻译、复现和调优,覆盖从 Prompt Engineering 到 RAG 开发、模型微调的全部流程,用最适合国内学习者的方式,指导国内开发者如何学习、入门 LLM 相关项目。\n","针对不同内容的特点,我们对共计 11 门吴恩达老师的大模型课程进行了翻译复现,并结合国内学习者的实际情况,对不同课程进行了分级和排序,初学者可以先系统学习我们的必修类课程,掌握入门 LLM 所有方向都需要掌握的基础技能和概念,再选择性地学习我们的选修类课程,在自己感兴趣的方向上不断探索和学习。\n","如果有你非常喜欢但我们还没有进行复现的吴恩达老师大模型课程,我们欢迎每一位开发者参考我们已有课程的格式和写法来对课程进行复现并提交 PR,在 PR 审核通过后,我们会根据课程内容将课程进行分级合并。欢迎每一位开发者的贡献!\n","在线阅读地址:面向开发者的 LLM 入门课程-在线阅读\n","PDF下载地址:面向开发者的 LLM 入门教程-PDF\n","英文原版地址:吴恩达关于大模型的系列课程\n","项目意义\n","LLM 正在逐步改变人们的生活,而对于开发者,如何基于 LLM 提供的 API 快速、便捷地开发一些具备更强能力、集成LLM 的应用,来便捷地实现一些更新颖、更实用的能力,是一个急需学习的重要能力。\n","由吴恩达老师与 OpenAI 合作推出的大模型系列教程,从大模型时代开发者的基础技能出发,深入浅出地介绍了如何基于大模型 API、LangChain 架构快速开发结合大模型强大能力的应用。其中,《Prompt Engineering for Developers》教程面向入门 LLM 的开发者,深入浅出地介绍了对于开发者,如何构造 Prompt 并基于 OpenAI 提供的 API 实现包括总结、推断、转换等多种常用功能,是入门 LLM 开发的经典教程;《Building Systems with the ChatGPT API》教程面向想要基于 LLM 开发应用程序的开发者,简洁有效而又系统全面地介绍了如何基于 ChatGPT API 打造完整的对话系统;《LangChain for LLM Application Development》教程结合经典大模型开源框架 LangChain,介绍了如何基于 LangChain 框架开发具备实用功能、能力全面的应用程序,《LangChain Chat With Your 
Data》教程则在此基础上进一步介绍了如何使用 LangChain 架构结合个人私有数据开发个性化大模型应用;《Building Generative AI Applications with Gradio》、《Evaluating and Debugging Generative AI》教程分别介绍了两个实用工具 Gradio 与 W&B,指导开发者如何结合这两个工具来打造、评估生成式 AI 应用。\n","上述教程非常适用于开发者学习以开启基于 LLM 实际搭建应用程序之路。因此,我们将该系列课程翻译为中文,并复现其范例代码,也为其中一个视频增加了中文字幕,支持国内中文学习者直接使用,以帮助中文学习者更好地学习 LLM 开发;我们也同时实现了效果大致相当的中文 Prompt,支持学习者感受中文语境下 LLM 的学习使用,对比掌握多语言语境下的 Prompt 设计与 LLM 开发。未来,我们也将加入更多 Prompt 高级技巧,以丰富本课程内容,帮助开发者掌握更多、更巧妙的 Prompt 技能。\n","项目受众\n","所有具备基础 Python 能力,想要入门 LLM 的开发者。\n","项目亮点\n","《ChatGPT Prompt Engineering for Developers》、《Building Systems with the ChatGPT API》等教程作为由吴恩达老师与 OpenAI 联合推出的官方教程,在可预见的未来会成为 LLM 的重要入门教程,但是目前还只支持英文版且国内访问受限,打造中文版且国内流畅访问的教程具有重要意义;同时,GPT 对中文、英文具有不同的理解能力,本教程在多次对比、实验之后确定了效果大致相当的中文 Prompt,支持学习者研究如何提升 ChatGPT 在中文语境下的理解与生成能力。\n","学习指南\n","本教程适用于所有具备基础 Python 能力,想要入门 LLM 的开发者。\n","如果你想要开始学习本教程,你需要提前具备:\n","\n","至少一个 LLM API(最好是 OpenAI,如果是其他 API,你可能需要参考其他教程对 API 调用代码进行修改)\n","能够使用 Python Jupyter Notebook\n","\n","本教程共包括 11 门课程,分为必修类、选修类两个类别。必修类课程是我们认为最适合初学者学习以入门 LLM 的课程,包括了入门 LLM 所有方向都需要掌握的基础技能和概念,我们也针对必修类课程制作了适合阅读的在线阅读和 PDF 版本,在学习必修类课程时,我们建议学习者按照我们列出的顺序进行学习;选修类课程是在必修类课程上的拓展延伸,包括了 RAG 开发、模型微调、模型评估等多个方面,适合学习者在掌握了必修类课程之后选择自己感兴趣的方向和课程进行学习。\n","必修类课程包括:\n","\n","面向开发者的 Prompt Engineering。基于吴恩达老师《ChatGPT Prompt Engineering for Developers》课程打造,面向入门 LLM 的开发者,深入浅出地介绍了对于开发者,如何构造 Prompt 并基于 OpenAI 提供的 API 实现包括总结、推断、转换等多种常用功能,是入门 LLM 开发的第一步。\n","搭建基于 ChatGPT 的问答系统。基于吴恩达老师《Building Systems with the ChatGPT API》课程打造,指导开发者如何基于 ChatGPT 提供的 API 开发一个完整的、全面的智能问答系统。通过代码实践,实现了基于 ChatGPT 开发问答系统的全流程,介绍了基于大模型开发的新范式,是大模型开发的实践基础。\n","使用 LangChain 开发应用程序。基于吴恩达老师《LangChain for LLM Application Development》课程打造,对 LangChain 展开深入介绍,帮助学习者了解如何使用 LangChain,并基于 LangChain 开发完整的、具备强大能力的应用程序。\n","使用 LangChain 访问个人数据。基于吴恩达老师《LangChain Chat with Your Data》课程打造,深入拓展 LangChain 提供的个人数据访问能力,指导开发者如何使用 LangChain 开发能够访问用户个人数据、提供个性化服务的大模型应用。\n","\n","选修类课程包括:\n","\n","使用 Gradio 搭建生成式 AI 应用。基于吴恩达老师《Building Generative AI Applications with Gradio》课程打造,指导开发者如何使用 Gradio 
通过 Python 接口程序快速、高效地为生成式 AI 构建用户界面。\n","评估改进生成式 AI。基于吴恩达老师《Evaluating and Debugging Generative AI》课程打造,结合 wandb,提供一套系统化的方法和工具,帮助开发者有效地跟踪和调试生成式 AI 模型。\n","微调大语言模型。基于吴恩达老师《Finetuning Large Language Model》课程打造,结合 lamini 框架,讲述如何便捷高效地在本地基于个人数据微调开源大语言模型。\n","大模型与语义检索。基于吴恩达老师《Large Language Models with Semantic Search》课程打造,针对检索增强生成,讲述了多种高级检索技巧以实现更准确、高效的检索增强 LLM 生成效果。\n","基于 Chroma 的高级检索。基于吴恩达老师《Advanced Retrieval for AI with Chroma》课程打造,旨在介绍基于 Chroma 的高级检索技术,提升检索结果的准确性。\n","搭建和评估高级 RAG 应用。基于吴恩达老师《Building and Evaluating Advanced RAG Applications》课程打造,介绍构建和实现高质量RAG系统所需的关键技术和评估框架。\n","LangChain 的 Functions、Tools 和 Agents。基于吴恩达老师《Functions, Tools and Agents with LangChain》课程打造,介绍如何基于 LangChain 的新语法构建 Agent。\n","Prompt 高级技巧。包括 CoT、自我一致性等多种 Prompt 高级技巧的基础理论与代码实现。\n","\n","其他资料包括:\n","双语字幕视频地址:吴恩达 x OpenAI的Prompt Engineering课程专业翻译版\n","中英双语字幕下载:《ChatGPT提示工程》非官方版中英双语字幕\n","视频讲解:面向开发者的 Prompt Engineering 讲解(数字游民大会)\n","目录结构说明:\n","content:基于原课程复现的双语版代码,可运行的 Notebook,更新频率最高,更新速度最快。\n","\n","docs:必修类课程文字教程版在线阅读源码,适合阅读的 md。\n","\n","figures:图片文件。\n","\n","致谢\n","核心贡献者\n","\n","邹雨衡-项目负责人(Datawhale成员-对外经济贸易大学研究生)\n","长琴-项目发起人(内容创作者-Datawhale成员-AI算法工程师)\n","玉琳-项目发起人(内容创作者-Datawhale成员)\n","徐虎-教程编撰者(内容创作者-Datawhale成员)\n","刘伟鸿-教程编撰者(内容创作者-江南大学非全研究生)\n","Joye-教程编撰者(内容创作者-数据科学家)\n","高立业(内容创作者-DataWhale成员-算法工程师)\n","邓宇文(内容创作者-Datawhale成员)\n","魂兮(内容创作者-前端工程师)\n","宋志学(内容创作者-Datawhale成员)\n","韩颐堃(内容创作者-Datawhale成员)\n","陈逸涵 (内容创作者-Datawhale意向成员-AI爱好者)\n","仲泰(内容创作者-Datawhale成员)\n","万礼行(内容创作者-视频翻译者)\n","王熠明(内容创作者-Datawhale成员)\n","曾浩龙(内容创作者-Datawhale 意向成员-JLU AI 研究生)\n","小饭同学(内容创作者)\n","孙韩玉(内容创作者-算法量化部署工程师)\n","张银晗(内容创作者-Datawhale成员)\n","左春生(内容创作者-Datawhale成员)\n","张晋(内容创作者-Datawhale成员)\n","李娇娇(内容创作者-Datawhale成员)\n","邓恺俊(内容创作者-Datawhale成员)\n","范致远(内容创作者-Datawhale成员)\n","周景林(内容创作者-Datawhale成员)\n","诸世纪(内容创作者-算法工程师)\n","Zhang Yixin(内容创作者-IT爱好者)\n","Sarai(内容创作者-AI应用爱好者)\n","\n","其他\n","\n","特别感谢 @Sm1les、@LSGOMYP 对本项目的帮助与支持;\n","感谢 GithubDaily 提供的双语字幕;\n","如果有任何想法可以联系我们 DataWhale 也欢迎大家多多提出 
issue;\n","特别感谢以下为教程做出贡献的同学!\n","\n","\n","\n","\n","Made with contrib.rocks.\n","Star History\n","\n","关注我们\n","\n","扫描下方二维码关注公众号:Datawhale\n","\n","\n","Datawhale 是一个专注于数据科学与 AI 领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。微信搜索公众号Datawhale可以加入我们。\n","LICENSE\n","本作品采用知识共享署名-非商业性使用-相同方式共享 4.0 \n"]}],"source":["start_index = doc.page_content.find(\"面向开发者的大模型手册\")\n","end_index = doc.page_content.find(\"国际许可协议进行许可。\")\n","print(doc.page_content[start_index:end_index])"]},{"cell_type":"markdown","id":"d35e99b4-dd67-4940-bbeb-b2a59bf8cd3d","metadata":{},"source":["## 三、Notion文档\n","\n","- 点击[Notion示例文档](https://yolospace.notion.site/Blendle-s-Employee-Handbook-e31bff7da17346ee99f531087d8b133f)右上方复制按钮(Duplicate),复制文档到你的Notion空间\n","- 点击右上方`⋯` 按钮,选择导出为Mardown&CSV。导出的文件将为zip文件夹\n","- 解压并保存mardown文档到本地路径`docs/Notion_DB/`"]},{"cell_type":"markdown","id":"f8cf2778-288c-4964-81e7-0ed881e31652","metadata":{},"source":["### 3.1 加载Notion Markdown文档"]},{"cell_type":"code","execution_count":16,"id":"081f5ee4-6b5d-45bf-a7e6-079abc560729","metadata":{},"outputs":[],"source":["from langchain.document_loaders import NotionDirectoryLoader\n","loader = NotionDirectoryLoader(\"docs/Notion_DB\")\n","docs = loader.load()"]},{"cell_type":"markdown","id":"88d5d094-a490-4c64-ab93-5c5cec0853aa","metadata":{"tags":[]},"source":["### 3.2 探索加载的数据"]},{"cell_type":"markdown","id":"a3ffe318-d22a-4687-b613-f679ad9ad616","metadata":{},"source":["文档加载后储存在`docs`变量中:\n","- `docs`的变量类型为`List`\n","- 打印 `docs` 
的长度可以看到一共包含多少页"]},{"cell_type":"code","execution_count":17,"id":"106323dd-0d24-40d4-8302-ed2a35d13347","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["\n"]}],"source":["print(type(docs))"]},{"cell_type":"code","execution_count":18,"id":"fa3ee0fe-7daa-4193-9ee1-4ee89cb3b843","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["51\n"]}],"source":["print(len(docs))"]},{"cell_type":"markdown","id":"b3cd3feb-d2c1-46b6-8b16-d4b2101c5632","metadata":{},"source":["`docs`中的每一元素为一个文档,变量类型为`langchain.schema.document.Document`, 文档变量类型包含两个属性\n","- `page_content` 包含该文档的内容。\n","- `meta_data` 为文档相关的描述性数据。"]},{"cell_type":"code","execution_count":19,"id":"5b220df6-fd0b-4b62-9da4-29962926fe87","metadata":{},"outputs":[],"source":["doc = docs[0]"]},{"cell_type":"code","execution_count":20,"id":"98ef8fbf-e820-41de-919d-c462a910f4f1","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["\n"]}],"source":["print(type(doc))"]},{"cell_type":"code","execution_count":21,"id":"933da0f3-4d5f-4363-9142-050ecf226c1f","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["# Firing\n","\n","Saying goodbye to Blendle (from a colleague) and to a colleague (from Blendle) is a very normal and natural thing. When done right, it can even be a beautiful thing. \n","\n","We advise you to read the backdrop below first, but feel free to jump in right away with the 'Here's what you can do'-section :). 
General note: you do not have to do this alone, so please ask for advice and help!\n","\n","- **Backdrop**\n"," \n"," Saying goodbye to Blendle (from a colleague) and to a colleague (from Blendle) is a \n"]}],"source":["print(doc.page_content[0:500])"]}],"metadata":{"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.9"}},"nbformat":4,"nbformat_minor":5}
+{"cells":[{"cell_type":"markdown","id":"cc2eb3ad-8c1c-406a-b7aa-a3f61b754ac5","metadata":{},"source":["# 第二章 文档加载\n","\n"," - [一、PDF文档](#一、PDF文档)\n"," - [1.1 安装相关包 ](#1.1-安装相关包-)\n"," - [1.2 加载PDF文档](#1.2-加载PDF文档)\n"," - [1.3 探索加载的数据](#1.3-探索加载的数据)\n"," - [二、YouTube音频](#二、YouTube音频)\n"," - [2.1 安装相关包 ](#2.1-安装相关包-)\n"," - [2.2 加载Youtube音频文档](#2.2-加载Youtube音频文档)\n"," - [2.3 探索加载的数据](#2.3-探索加载的数据)\n"," - [三、网页文档](#三、网页文档)\n"," - [3.1 加载网页文档](#3.1-加载网页文档)\n"," - [3.2 探索加载的数据](#3.2-探索加载的数据)\n"," - [四、Notion文档](#四、Notion文档)\n"," - [4.1 加载Notion Markdown文档](#4.1-加载Notion-Markdown文档)\n"," - [4.2 探索加载的数据](#4.2-探索加载的数据)\n"]},{"cell_type":"markdown","id":"89e612a5","metadata":{},"source":["同前序课程一样,我们同样需要先配置环境以访问 OpenAI 提供的 API,配置方法如下,详细介绍请参见其他课程"]},{"cell_type":"code","execution_count":1,"id":"582125c3-2afb-4cca-b651-c1810a5e5c22","metadata":{},"outputs":[],"source":["# 下载最新版 LangChain\n","!pip install -q langchain --upgrade"]},{"cell_type":"code","execution_count":2,"id":"bb73a77f-e17c-45a2-b456-e3ad2bf0fb5c","metadata":{},"outputs":[],"source":["import os\n","import openai\n","import sys\n","sys.path.append('../..')\n","\n","from dotenv import load_dotenv, find_dotenv\n","_ = load_dotenv(find_dotenv()) \n","\n","openai.api_key = os.environ['OPENAI_API_KEY']"]},{"cell_type":"markdown","id":"63558db2-5279-4c1b-9bec-355ab04731e6","metadata":{},"source":["## 一、PDF文档\n","\n","首先,我们来加载一个[PDF文档](https://see.stanford.edu/materials/aimlcs229/transcripts/MachineLearning-Lecture01.pdf)。该文档为吴恩达教授的2009年机器学习课程的字幕文件。因为这些字幕为自动生成,所以词句之间可能不太连贯和通畅。"]},{"cell_type":"markdown","id":"dd5fe85c-6aae-4739-9b47-68e791afc9ac","metadata":{},"source":["### 1.1 安装相关包 "]},{"cell_type":"code","execution_count":3,"id":"c527f944-35dc-44a2-9cf9-9887cf315f3a","metadata":{},"outputs":[],"source":["!pip install -q pypdf"]},{"cell_type":"markdown","id":"8dcb2102-0414-4130-952b-3b6fa33b61bb","metadata":{},"source":["### 1.2 
加载PDF文档"]},{"cell_type":"code","execution_count":4,"id":"52d9891f-a8cc-47c4-8c09-81794647a720","metadata":{},"outputs":[],"source":["from langchain.document_loaders import PyPDFLoader\n","\n","# 创建一个 PyPDFLoader Class 实例,输入为待加载的pdf文档路径\n","loader = PyPDFLoader(\"docs/cs229_lectures/MachineLearning-Lecture01.pdf\")\n","\n","# 调用 PyPDFLoader Class 的函数 load对pdf文件进行加载\n","pages = loader.load()"]},{"cell_type":"markdown","id":"68d40600-49ab-42a3-97d0-b9a2c4ab8139","metadata":{},"source":["### 1.3 探索加载的数据"]},{"cell_type":"markdown","id":"feca9f1e-1596-49f2-a6d9-6eeaeffbd90b","metadata":{},"source":["文档加载后储存在`pages`变量中:\n","- `pages`的变量类型为`List`\n","- 打印 `pages` 的长度可以看到pdf一共包含多少页"]},{"cell_type":"code","execution_count":5,"id":"9463b982-c71c-4241-b3a3-b040170eef2e","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["\n"]}],"source":["print(type(pages))"]},{"cell_type":"code","execution_count":6,"id":"67a2b815-586f-43a5-96a4-cfe46001a766","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["22\n"]}],"source":["print(len(pages))"]},{"cell_type":"markdown","id":"2cde6b9d-71c8-4851-a8f6-a3f0e76f6dab","metadata":{},"source":["`pages`中的每一元素为一个文档,变量类型为`langchain.schema.Document`, 文档变量类型包含两个属性\n","- `page_content` 包含该文档的内容。\n","- `metadata` 为文档相关的描述性数据。"]},{"cell_type":"code","execution_count":7,"id":"921827ae-3a5b-4f29-b015-a5dde3be1410","metadata":{},"outputs":[],"source":["page = pages[0]"]},{"cell_type":"code","execution_count":8,"id":"aadaa840-0f30-4ae3-b06b-7fe8f468d146","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["\n"]}],"source":["print(type(page))"]},{"cell_type":"code","execution_count":9,"id":"85777ce2-42c7-4e11-b1ba-06fd6a0d8502","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["MachineLearning-Lecture01 \n","Instructor (Andrew Ng): Okay. Good morning. Welcome to CS229, the machine \n","learning class. 
So what I wanna do today is ju st spend a little time going over the logistics \n","of the class, and then we'll start to talk a bit about machine learning. \n","By way of introduction, my name's Andrew Ng and I'll be instru ctor for this class. And so \n","I personally work in machine learning, and I' ve worked on it for about 15 years now, and \n","I actually think that machine learning i\n"]}],"source":["print(page.page_content[0:500])"]},{"cell_type":"code","execution_count":10,"id":"8a1f8acd-f8c7-46af-a29f-df172067deba","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["{'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf', 'page': 0}\n"]}],"source":["print(page.metadata)"]},{"cell_type":"markdown","id":"1e9cead7-a967-4a8f-8d3d-0f94f2ff129e","metadata":{},"source":["## 二、YouTube音频\n","\n","在第一部分的内容,我们学习了如何加载PDF文档。在这部分的内容,我们学习对于给定的 YouTube 视频链接\n","- 如何使用LangChain加载器将视频的音频下载到本地\n","- 然后使用OpenAIWhisperParser解析器将音频转化为文本"]},{"cell_type":"markdown","id":"b4720268-ddab-4c18-9072-10aab8f0ac7c","metadata":{},"source":["### 2.1 安装相关包 "]},{"cell_type":"code","execution_count":11,"id":"37dbeb50-d6c5-4db0-88da-1ef9d3e47417","metadata":{},"outputs":[],"source":["!pip -q install yt_dlp\n","!pip -q install pydub"]},{"cell_type":"markdown","id":"a243b258-eae3-46b0-803f-cd897b31cf78","metadata":{},"source":["### 2.2 加载Youtube音频文档"]},{"cell_type":"code","execution_count":12,"id":"f25593f9-a6d2-4137-94cb-881141ca99fd","metadata":{},"outputs":[],"source":["from langchain.document_loaders.generic import GenericLoader\n","from langchain.document_loaders.parsers import OpenAIWhisperParser\n","from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader"]},{"cell_type":"code","execution_count":2,"id":"5ca7b99a-ba4d-4989-aed6-be76acb405c0","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["[youtube] Extracting URL: https://www.youtube.com/watch?v=jGwO_UgTS7I\n","[youtube] jGwO_UgTS7I: Downloading 
webpage\n","[youtube] jGwO_UgTS7I: Downloading ios player API JSON\n","[youtube] jGwO_UgTS7I: Downloading android player API JSON\n","[youtube] jGwO_UgTS7I: Downloading m3u8 information\n","[info] jGwO_UgTS7I: Downloading 1 format(s): 140\n","[download] docs/youtube//Stanford CS229: Machine Learning Course, Lecture 1 - Andrew Ng (Autumn 2018).m4a has already been downloaded\n","[download] 100% of 69.76MiB\n","[ExtractAudio] Not converting audio docs/youtube//Stanford CS229: Machine Learning Course, Lecture 1 - Andrew Ng (Autumn 2018).m4a; file is already in target format m4a\n","Transcribing part 1!\n","Transcribing part 2!\n","Transcribing part 3!\n","Transcribing part 4!\n"]}],"source":["url=\"https://www.youtube.com/watch?v=jGwO_UgTS7I\"\n","save_dir=\"docs/youtube/\"\n","\n","# 创建一个 GenericLoader Class 实例\n","loader = GenericLoader(\n"," #将链接url中的Youtube视频的音频下载下来,存在本地路径save_dir\n"," YoutubeAudioLoader([url],save_dir), \n"," \n"," #使用OpenAIWhisperParser解析器将音频转化为文本\n"," OpenAIWhisperParser()\n",")\n","\n","# 调用 GenericLoader Class 的函数 load对视频的音频文件进行加载\n","docs = loader.load()"]},{"cell_type":"markdown","id":"ffb4db5d-39b5-4cd7-82d9-824ed71fc116","metadata":{"tags":[]},"source":["### 2.3 探索加载的数据"]},{"cell_type":"markdown","id":"0fd91c34-ac19-4a09-8ca0-99262011d9ba","metadata":{},"source":["文档加载后储存在`docs`变量中:\n","- `docs`的变量类型为`List`\n","- 打印 `docs` 的长度可以看到一共包含多少个文档"]},{"cell_type":"code","execution_count":15,"id":"ddb89cee-32bd-4c5f-91f1-c46d1f0300da","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["\n"]}],"source":["print(type(docs))"]},{"cell_type":"code","execution_count":16,"id":"2cea4ff3-8548-4158-9e55-a574de0fd29e","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["1\n"]}],"source":["print(len(docs))"]},{"cell_type":"markdown","id":"24952655-128e-4a7b-b8c0-e93156acbe1b","metadata":{},"source":["`docs`中的每一元素为一个文档,变量类型为`langchain.schema.document.Document`, 文档变量类型包含两个属性\n","- `page_content` 包含该文档的内容。\n","- 
`metadata` 为文档相关的描述性数据。"]},{"cell_type":"code","execution_count":19,"id":"d89bf91d-d39b-4682-9c56-0cde449d6051","metadata":{},"outputs":[],"source":["doc = docs[0]"]},{"cell_type":"code","execution_count":20,"id":"e6df253f-ad9e-42d5-b6e1-47b7c0d2d564","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["\n"]}],"source":["print(type(doc))"]},{"cell_type":"code","execution_count":13,"id":"80e4b798-875e-4f0e-ba16-5277f8ec1f62","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Welcome to CS229 Machine Learning. Uh, some of you know that this is a class that's taught at Stanford for a long time. And this is often the class that, um, I most look forward to teaching each year because this is where we've helped, I think, several generations of Stanford students become experts in machine learning, got- built many of their products and services and startups that I'm sure, many of you or probably all of you are using, uh, uh, today. Um, so what I want to do today was spend s\n"]}],"source":["print(doc.page_content[0:500])"]},{"cell_type":"code","execution_count":14,"id":"7f363e33-6a4d-4b78-aa7d-1b8cf6b59567","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["{'source': 'docs/youtube/Stanford CS229: Machine Learning Course, Lecture 1 - Andrew Ng (Autumn 2018).m4a', 'chunk': 0}\n"]}],"source":["print(doc.metadata)"]},{"cell_type":"markdown","id":"5b7ddc7d-2d40-4811-8cb3-5e73344ebe24","metadata":{},"source":["## 三、网页文档\n","\n","在第二部分,我们对于给定的 YouTube 视频链接 (URL),使用 LangChain 加载器将视频的音频下载到本地,然后使用 OpenAIWhisperParser 解析器将音频转化为文本。\n","\n","本部分,对于给定网页文档链接(URLs),我们学习如何对其进行加载。这里我们对GitHub上的网页文档进行加载,该文档格式为Markdown。"]},{"cell_type":"markdown","id":"b28abf4d-4907-47f6-b54d-6d322a5794e6","metadata":{},"source":["### 3.1 加载网页文档"]},{"cell_type":"code","execution_count":21,"id":"1a68375f-44ae-4905-bf9c-1f01ec800481","metadata":{},"outputs":[],"source":["from langchain.document_loaders import WebBaseLoader\n","\n","\n","# 创建一个 
WebBaseLoader Class 实例\n","url = \"https://github.com/basecamp/handbook/blob/master/37signals-is-you.md\"\n","header = {'User-Agent': 'python-requests/2.27.1', \n"," 'Accept-Encoding': 'gzip, deflate, br', \n"," 'Accept': '*/*',\n"," 'Connection': 'keep-alive'}\n","loader = WebBaseLoader(web_path=url,header_template=header)\n","\n","# 调用 WebBaseLoader Class 的函数 load对文件进行加载\n","docs = loader.load()"]},{"cell_type":"markdown","id":"fc24f44a-01f5-49a3-9529-2f05c1053b2c","metadata":{"tags":[]},"source":["### 3.2 探索加载的数据"]},{"cell_type":"markdown","id":"f2f108b9-713b-4b98-b44d-4dfc3dcbcde2","metadata":{},"source":["文档加载后储存在`docs`变量中:\n","- `docs`的变量类型为`List`\n","- 打印 `docs` 的长度可以看到一共包含多少个文档"]},{"cell_type":"code","execution_count":22,"id":"8c8670fa-203b-4c35-9266-f976f50f0f5d","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["\n"]}],"source":["print(type(docs))"]},{"cell_type":"code","execution_count":23,"id":"0e85a526-55e7-4186-8697-d16a891bcabc","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["1\n"]}],"source":["print(len(docs))"]},{"cell_type":"markdown","id":"b4397791-4d3c-4609-86be-3cda90a3f2fc","metadata":{},"source":["`docs`中的每一元素为一个文档,变量类型为`langchain.schema.document.Document`, 文档变量类型包含两个属性\n","- `page_content` 包含该文档的内容。\n","- `metadata` 为文档相关的描述性数据。"]},{"cell_type":"code","execution_count":24,"id":"26423c51-6503-478c-b17d-bbe57049a04c","metadata":{},"outputs":[],"source":["doc = 
docs[0]"]},{"cell_type":"code","execution_count":25,"id":"a243638f-0c23-46b8-8854-13f1f1de6f0a","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["\n"]}],"source":["print(type(doc))"]},{"cell_type":"code","execution_count":26,"id":"9f237541-79b7-4ae4-9e0e-27a28af99b7a","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["{\"payload\":{\"allShortcutsEnabled\":false,\"fileTree\":{\"\":{\"items\":[{\"name\":\"37signals-is-you.md\",\"path\":\"37signals-is-you.md\",\"contentType\":\"file\"},{\"name\":\"LICENSE.md\",\"path\":\"LICENSE.md\",\"contentType\":\"file\"},{\"name\":\"README.md\",\"path\":\"README.md\",\"contentType\":\"file\"},{\"name\":\"benefits-and-perks.md\",\"path\":\"benefits-and-perks.md\",\"contentType\":\"file\"},{\"name\":\"code-of-conduct.md\",\"path\":\"code-of-conduct.md\",\"contentType\":\"file\"},{\"name\":\"faq.md\",\"path\":\"faq.md\",\"contentType\":\"file\"},{\"name\":\"getting-started.md\",\"path\":\"getting-started.md\",\"contentType\":\"file\"},{\"name\":\"how-we-work.md\",\"path\":\"how-we-work.md\",\"contentType\":\"file\"},{\"name\":\"international-travel-guide.md\",\"path\":\"international-travel-guide.md\",\"contentType\":\"file\"},{\"name\":\"making-a-career.md\",\"path\":\"making-a-career.md\",\"contentType\":\"file\"},{\"name\":\"managing-work-devices.md\",\"path\":\"managing-work-devices.md\",\"contentType\":\"file\"},{\"name\":\"moonlighting.md\",\"path\":\"moonlighting.md\",\"contentType\":\"file\"},{\"name\":\"our-internal-systems.md\",\"path\":\"our-internal-systems.md\",\"contentType\":\"file\"},{\"name\":\"our-rituals.md\",\"path\":\"our-rituals.md\",\"contentType\":\"file\"},{\"name\":\"performance-plans.md\",\"path\":\"performance-plans.md\",\"contentType\":\"file\"},{\"name\":\"product-histories.md\",\"path\":\"product-histories.md\",\"contentType\":\"file\"},{\"name\":\"stateFMLA.md\",\"path\":\"stateFMLA.md\",\"contentType\":\"file\"},{\"name\":\"titles-for-data.md\",\"path\":\
"titles-for-data.md\",\"contentType\":\"file\"},{\"name\":\"titles-for-designers.md\",\"path\":\"titles-for-designers.md\",\"contentType\":\"file\"},{\"name\":\"titles-for-ops.md\",\"path\":\"titles-for-ops.md\",\"contentType\":\"file\"},{\"name\":\"titles-for-programmers.md\",\"path\":\"titles-for-programmers.md\",\"contentType\":\"file\"},{\"name\":\"titles-for-support.md\",\"path\":\"titles-for-support.md\",\"contentType\":\"file\"},{\"name\":\"vocabulary.md\",\"path\":\"vocabulary.md\",\"contentType\":\"file\"},{\"name\":\"what-influenced-us.md\",\"path\":\"what-influenced-us.md\",\"contentType\":\"file\"},{\"name\":\"what-we-stand-for.md\",\"path\":\"what-we-stand-for.md\",\"contentType\":\"file\"},{\"name\":\"where-we-work.md\",\"path\":\"where-we-work.md\",\"contentType\":\"file\"}],\"totalCount\":26}},\"fileTreeProcessingTime\":3.936437,\"foldersToFetch\":[],\"reducedMotionEnabled\":null,\"repo\":{\"id\":90042196,\"defaultBranch\":\"master\",\"name\":\"handbook\",\"ownerLogin\":\"basecamp\",\"currentUserCanPush\":false,\"isFork\":false,\"isEmpty\":false,\"createdAt\":\"2017-05-02T14:23:23.000Z\",\"ownerAvatar\":\"https://avatars.githubusercontent.com/u/13131?v=4\",\"public\":true,\"private\":false,\"isOrgOwned\":true},\"refInfo\":{\"name\":\"master\",\"listCacheKey\":\"v0:1682672280.0\",\"canEdit\":false,\"refType\":\"branch\",\"currentOid\":\"1577f27c63aa8df61996924824afb8df6f1bf20e\"},\"path\":\"37signals-is-you.md\",\"currentUser\":null,\"blob\":{\"rawBlob\":null,\"colorizedLines\":null,\"stylingDirectives\":null,\"csv\":null,\"csvError\":null,\"dependabotInfo\":{\"showConfigurationBanner\":false,\"configFilePath\":null,\"networkDependabotPath\":\"/basecamp/handbook/network/updates\",\"dismissConfigurationNoticePath\":\"/settings/dismiss-notice/dependabot_configuration_notice\",\"configurationNoticeDismissed\":null,\"repoAlertsPath\":\"/basecamp/handbook/security/dependabot\",\"repoSecurityAndAnalysisPath\":\"/basecamp/handbook/settings/security_analysis\
",\"repoOwnerIsOrg\":true,\"currentUserCanAdminRepo\":false},\"displayName\":\"37signals-is-you.md\",\"displayUrl\":\"https://github.com/basecamp/handbook/blob/master/37signals-is-you.md?raw=true\",\"headerInfo\":{\"blobSize\":\"2.19 KB\",\"deleteInfo\":{\"deletePath\":null,\"deleteTooltip\":\"You must be signed in to make or propose changes\"},\"editInfo\":{\"editTooltip\":\"You must be signed in to make or propose changes\"},\"ghDesktopPath\":\"https://desktop.github.com\",\"gitLfsPath\":null,\"onBranch\":true,\"shortPath\":\"e5ca0f0\",\"siteNavLoginPath\":\"/login?return_to=https%3A%2F%2Fgithub.com%2Fbasecamp%2Fhandbook%2Fblob%2Fmaster%2F37signals-is-you.md\",\"isCSV\":false,\"isRichtext\":true,\"toc\":[{\"level\":1,\"text\":\"37signals Is You\",\"anchor\":\"37signals-is-you\",\"htmlText\":\"37signals Is You\"}],\"lineInfo\":{\"truncatedLoc\":\"11\",\"truncatedSloc\":\"6\"},\"mode\":\"file\"},\"image\":false,\"isCodeownersFile\":null,\"isValidLegacyIssueTemplate\":false,\"issueTemplateHelpUrl\":\"https://docs.github.com/articles/about-issue-and-pull-request-templates\",\"issueTemplate\":null,\"discussionTemplate\":null,\"language\":\"Markdown\",\"large\":false,\"loggedIn\":false,\"newDiscussionPath\":\"/basecamp/handbook/discussions/new\",\"newIssuePath\":\"/basecamp/handbook/issues/new\",\"planSupportInfo\":{\"repoIsFork\":null,\"repoOwnedByCurrentUser\":null,\"requestFullPath\":\"/basecamp/handbook/blob/master/37signals-is-you.md\",\"showFreeOrgGatedFeatureMessage\":null,\"showPlanSupportBanner\":null,\"upgradeDataAttributes\":null,\"upgradePath\":null},\"publishBannersInfo\":{\"dismissActionNoticePath\":\"/settings/dismiss-notice/publish_action_from_dockerfile\",\"dismissStackNoticePath\":\"/settings/dismiss-notice/publish_stack_from_file\",\"releasePath\":\"/basecamp/handbook/releases/new?marketplace=true\",\"showPublishActionBanner\":false,\"showPublishStackBanner\":false},\"renderImageOrRaw\":false,\"richText\":\"37signals Is You\\nEveryone working at 
37signals represents 37signals. When a customer gets a response from Merissa on support, Merissa is 37signals. When a customer reads a tweet by Eron that our systems are down, Eron is 37signals. In those situations, all the other stuff we do to cultivate our best image is secondary. What’s right in front of someone in a time of need is what they’ll remember.\\nThat’s what we mean when we say marketing is everyone’s responsibility, and that it pays to spend the time to recognize that. This means avoiding the bullshit of outage language and bending our policies, not just lending your ears. It means taking the time to get the writing right and consider how you’d feel if you were on the other side of the interaction.\\nThe vast majority of our customers come from word of mouth and much of that word comes from people in our audience. This is an audience we’ve been educating and entertaining for 20 years and counting, and your voice is part of us now, whether you like it or not! Tell us and our audience what you have to say!\\nThis goes for tools and techniques as much as it goes for prose. 37signals not only tries to out-teach the competition, but also out-share and out-collaborate. We’re prolific open source contributors through Ruby on Rails, Trix, Turbolinks, Stimulus, and many other projects. Extracting the common infrastructure that others could use as well is satisfying, important work, and we should continue to do that.\\nIt’s also worth mentioning that joining 37signals can be all-consuming. We’ve seen it happen. You dig 37signals, so you feel pressure to contribute, maybe overwhelmingly so. The people who work here are some of the best and brightest in our industry, so the self-imposed burden to be exceptional is real. But here’s the thing: stop it. Settle in. We’re glad you love this job because we all do too, but at the end of the day it’s a job. 
Do your best work, collaborate with your team, write, read, learn, and then turn off your computer and play with your dog. We’ll all be better for it.\\n\",\"renderedFileInfo\":null,\"tabSize\":8,\"topBannersInfo\":{\"overridingGlobalFundingFile\":false,\"globalPreferredFundingPath\":null,\"repoOwner\":\"basecamp\",\"repoName\":\"handbook\",\"showInvalidCitationWarning\":false,\"citationHelpUrl\":\"https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/creating-a-repository-on-github/about-citation-files\",\"showDependabotConfigurationBanner\":false,\"actionsOnboardingTip\":null},\"truncated\":false,\"viewable\":true,\"workflowRedirectUrl\":null,\"symbols\":{\"timedOut\":false,\"notAnalyzed\":true,\"symbols\":[]}},\"csrf_tokens\":{\"/basecamp/handbook/branches\":{\"post\":\"o3HTNEDyuKtINffBkguVz-P3KUwBN04ZM_vvyoNKymcy66lDUtXVvEi7EvsbgFoz2d3qgU_earsuIftbbtKlcg\"}}},\"title\":\"handbook/37signals-is-you.md at master · basecamp/handbook\",\"locale\":\"en\"}\n"]}],"source":["print(doc.page_content)"]},{"cell_type":"markdown","id":"26538237-1fc2-4915-944a-1f68f3ae3759","metadata":{},"source":[" "]},{"cell_type":"markdown","id":"52025103-205e-4137-a116-89f37fcfece1","metadata":{},"source":["可以看到上面的文档内容包含许多冗余的信息。通常来讲,我们需要进行对这种数据进行进一步处理(Post Processing)。"]},{"cell_type":"code","execution_count":27,"id":"a7c5281f-aeed-4ee7-849b-bbf9fd3e35c7","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["37signals Is You\n","Everyone working at 37signals represents 37signals. When a customer gets a response from Merissa on support, Merissa is 37signals. When a customer reads a tweet by Eron that our systems are down, Eron is 37signals. In those situations, all the other stuff we do to cultivate our best image is secondary. What’s right in front of someone in a time of need is what they’ll remember.\n","That’s what we mean when we say marketing is everyone’s responsibility, and that it pays to spend the time to recognize that. 
This means avoiding the bullshit of outage language and bending our policies, not just lending your ears. It means taking the time to get the writing right and consider how you’d feel if you were on the other side of the interaction.\n","The vast majority of our customers come from word of mouth and much of that word comes from people in our audience. This is an audience we’ve been educating and entertaining for 20 years and counting, and your voice is part of us now, whether you like it or not! Tell us and our audience what you have to say!\n","This goes for tools and techniques as much as it goes for prose. 37signals not only tries to out-teach the competition, but also out-share and out-collaborate. We’re prolific open source contributors through Ruby on Rails, Trix, Turbolinks, Stimulus, and many other projects. Extracting the common infrastructure that others could use as well is satisfying, important work, and we should continue to do that.\n","It’s also worth mentioning that joining 37signals can be all-consuming. We’ve seen it happen. You dig 37signals, so you feel pressure to contribute, maybe overwhelmingly so. The people who work here are some of the best and brightest in our industry, so the self-imposed burden to be exceptional is real. But here’s the thing: stop it. Settle in. We’re glad you love this job because we all do too, but at the end of the day it’s a job. Do your best work, collaborate with your team, write, read, learn, and then turn off your computer and play with your dog. 
We’ll all be better for it.\n","\n"]}],"source":["import json\n","convert_to_json = json.loads(doc.page_content)\n","extracted_markdow = convert_to_json['payload']['blob']['richText']\n","print(extracted_markdow)"]},{"cell_type":"markdown","id":"d35e99b4-dd67-4940-bbeb-b2a59bf8cd3d","metadata":{},"source":["## 四、Notion文档\n","\n","- 点击[Notion示例文档](https://yolospace.notion.site/Blendle-s-Employee-Handbook-e31bff7da17346ee99f531087d8b133f)右上方复制按钮(Duplicate),复制文档到你的Notion空间\n","- 点击右上方`⋯` 按钮,选择导出为Markdown & CSV。导出的文件将为zip压缩包\n","- 解压并保存Markdown文档到本地路径`docs/Notion_DB/`"]},{"cell_type":"markdown","id":"f8cf2778-288c-4964-81e7-0ed881e31652","metadata":{},"source":["### 4.1 加载Notion Markdown文档"]},{"cell_type":"code","execution_count":28,"id":"081f5ee4-6b5d-45bf-a7e6-079abc560729","metadata":{},"outputs":[],"source":["from langchain.document_loaders import NotionDirectoryLoader\n","loader = NotionDirectoryLoader(\"docs/Notion_DB\")\n","docs = loader.load()"]},{"cell_type":"markdown","id":"88d5d094-a490-4c64-ab93-5c5cec0853aa","metadata":{"tags":[]},"source":["### 4.2 探索加载的数据"]},{"cell_type":"markdown","id":"a3ffe318-d22a-4687-b613-f679ad9ad616","metadata":{},"source":["文档加载后储存在`docs`变量中:\n","- `docs`的变量类型为`List`\n","- 打印 `docs` 的长度可以看到一共包含多少个文档"]},{"cell_type":"code","execution_count":29,"id":"106323dd-0d24-40d4-8302-ed2a35d13347","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["\n"]}],"source":["print(type(docs))"]},{"cell_type":"code","execution_count":30,"id":"fa3ee0fe-7daa-4193-9ee1-4ee89cb3b843","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["1\n"]}],"source":["print(len(docs))"]},{"cell_type":"markdown","id":"b3cd3feb-d2c1-46b6-8b16-d4b2101c5632","metadata":{},"source":["`docs`中的每一元素为一个文档,变量类型为`langchain.schema.document.Document`, 文档变量类型包含两个属性\n","- `page_content` 包含该文档的内容。\n","- `metadata` 
为文档相关的描述性数据。"]},{"cell_type":"code","execution_count":31,"id":"5b220df6-fd0b-4b62-9da4-29962926fe87","metadata":{},"outputs":[],"source":["doc = docs[0]"]},{"cell_type":"code","execution_count":32,"id":"98ef8fbf-e820-41de-919d-c462a910f4f1","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["\n"]}],"source":["print(type(doc))"]},{"cell_type":"code","execution_count":33,"id":"933da0f3-4d5f-4363-9142-050ecf226c1f","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["# Blendle's Employee Handbook\n","\n","This is a living document with everything we've learned working with people while running a startup. And, of course, we continue to learn. Therefore it's a document that will continue to change. \n","\n","**Everything related to working at Blendle and the people of Blendle, made public.**\n","\n","These are the lessons from three years of working with the people of Blendle. It contains everything from [how our leaders lead](https://www.notion.so/ecfb7e647136468a9a0a32f1771a8f52?pv\n"]}],"source":["print(doc.page_content[0:500])"]}],"metadata":{"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.12"}},"nbformat":4,"nbformat_minor":5}
diff --git "a/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/3.\346\226\207\346\241\243\345\210\206\345\211\262 Splitting.ipynb" "b/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/3.\346\226\207\346\241\243\345\210\206\345\211\262 Splitting.ipynb"
index 958c6310..d708f067 100644
--- "a/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/3.\346\226\207\346\241\243\345\210\206\345\211\262 Splitting.ipynb"
+++ "b/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/3.\346\226\207\346\241\243\345\210\206\345\211\262 Splitting.ipynb"
@@ -1 +1 @@
-{"cells":[{"cell_type":"markdown","metadata":{},"source":["# 第三章 文档分割\n","\n"," - [一、配置环境](#一、配置环境)\n"," - [二、文档分割重要性](#二、文档分割重要性)\n"," - [三、文档分割方式](#三、文档分割方式)\n"," - [3.1 基于字符的分割方式](#3.1-基于字符的分割方式)\n"," - [3.2 基于token的分割](#3.2-基于token的分割)\n"," - [3.3 分割Markdown文档](#3.3-分割Markdown文档)\n"]},{"cell_type":"markdown","metadata":{},"source":["我们刚刚讨论了如何将文档加载到标准格式中,现在我们要谈论如何将它们分割成较小的块。这听起来可能很简单,但其中有很多微妙之处会对后续工作产生重要影响。"]},{"cell_type":"markdown","metadata":{},"source":["## 一、配置环境"]},{"cell_type":"code","execution_count":1,"metadata":{},"outputs":[],"source":["import os\n","import openai\n","import sys\n","sys.path.append('../..')\n","\n","from dotenv import load_dotenv, find_dotenv\n","_ = load_dotenv(find_dotenv()) \n","\n","openai.api_key = os.environ['OPENAI_API_KEY']"]},{"attachments":{"image.png":{"image/png":""}},"cell_type":"markdown","metadata":{},"source":["## 二、文档分割重要性\n","数据加载完毕后进行分割,分割完毕后放入向量数据库,进行向量化\n","\n","\n","\n","如果直接按照单个字符进行分割,会丢失文本的语义信息,从而导致无法正确的回答问题。因此,需要尽量保持语义的完整性,分割出具有语义的块。"]},{"attachments":{"image-2.png":{"image/png":""},"image.png":{"image/png":""}},"cell_type":"markdown","metadata":{},"source":["## 三、文档分割方式\n","Langchain中文本分割器都根据chunk_size(块大小)和chunk_overlap(块与块之间的重叠大小)进行分割。\n","\n","\n","\n","* chunk_size指每个块包含的字符或Token(如单词、句子等)的数量\n","\n","* chunk_overlap指两个块之间共享的字符数量,用于保持上下文的连贯性,避免分割丢失上下文信息\n","\n","\n","\n","Langchain提供多种文档分割方式,区别在怎么确定块与块之间的边界、块由哪些字符/token组成、以及如何测量块大小"]},{"attachments":{},"cell_type":"markdown","metadata":{},"source":["### 3.1 基于字符的分割方式\n","怎么分割块取决于我们的任务类型,当您拆分代码时,这一点非常明显。所以我们有一个语言文本分割器,它有一堆不同的分隔符,用于各种Python、Ruby、C 等不同语言的版本。在拆分这些文档时,考虑这些不同的语言的差异性。"]},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[],"source":["\n","''' \n","* RecursiveCharacterTextSplitter 递归字符文本分割\n","* CharacterTextSplitter 字符分割\n","RecursiveCharacterTextSplitter 将按不同的字符递归地分割(按照这个优先级[\"\\n\\n\", \"\\n\", \" \", \"\"]),这样就能尽量把所有和语义相关的内容尽可能长时间地保留在同一位置\n","RecursiveCharacterTextSplitter需要关注的是4个参数:\n","\n","* separators - 分隔符字符串数组\n","* 
chunk_size - 每个文档的字符数量限制\n","* chunk_overlap - 两份文档重叠区域的长度\n","* length_function - 长度计算函数\n","CharacterTextSplitter 字符分割\n","分隔符的参数是单个的字符串。这就会使得RecursiveCharacterTextSplitter比CharacterTextSplitter对文档切割得更加碎片化\n","'''\n","#导入文本分割器\n","from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter\n"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[],"source":["chunk_size = 26 #设置块大小\n","chunk_overlap = 4 #设置块重叠大小\n"]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[],"source":["#初始化文本分割器\n","r_splitter = RecursiveCharacterTextSplitter(\n"," chunk_size=chunk_size,\n"," chunk_overlap=chunk_overlap\n",")\n","c_splitter = CharacterTextSplitter(\n"," chunk_size=chunk_size,\n"," chunk_overlap=chunk_overlap\n",")\n"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[{"data":{"text/plain":["['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg']"]},"execution_count":5,"metadata":{},"output_type":"execute_result"}],"source":["#使用递归字符文本分割器\n","text2 = \"abcdefghijklmnopqrstuvwxyzabcdefg\"\n","r_splitter.split_text(text2)\n"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[{"data":{"text/plain":["['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']"]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["text3 = \"a b c d e f g h i j k l m n o p q r s t u v w x y z\"#测试文本\n","r_splitter.split_text(text3)\n"]},{"attachments":{},"cell_type":"markdown","metadata":{},"source":["第二块是从'wxyz'开始的,刚好是我们设定的块重叠大小"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[{"data":{"text/plain":["['a b c d e f g h i j k l m n o p q r s t u v w x y 
z']"]},"execution_count":7,"metadata":{},"output_type":"execute_result"}],"source":["#字符文本分割器\n","c_splitter.split_text(text3)\n"]},{"attachments":{},"cell_type":"markdown","metadata":{},"source":["可以看到字符分割器没有分割这个文本,因为字符文本分割器默认以换行符为分隔符,因此需要设置空格为分隔符。"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"data":{"text/plain":["['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']"]},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":["# 设置空格分隔符\n","c_splitter = CharacterTextSplitter(\n"," chunk_size=chunk_size,\n"," chunk_overlap=chunk_overlap,\n"," separator=' '\n",")\n","c_splitter.split_text(text3)\n"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[],"source":["# 递归分割长段落\n","some_text1 = \"\"\"When writing documents, writers will use document structure to group content. \\\n","This can convey to the reader, which idea's are related. For example, closely related ideas \\\n","are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \\n\\n \\\n","Paragraphs are often delimited with a carriage return or two carriage returns. \\\n","Carriage returns are the \"backslash n\" you see embedded in this string. 
\\\n","Sentences have a period at the end, but also, have a space.\\\n","and words are separated by space.\"\"\"\n"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[{"data":{"text/plain":["496"]},"execution_count":10,"metadata":{},"output_type":"execute_result"}],"source":["len(some_text1)\n"]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[],"source":["# 中文版\n","some_text2 = \"\"\"在编写文档时,作者将使用文档结构对内容进行分组。 \\\n"," 这可以向读者传达哪些想法是相关的。 例如,密切相关的想法\\\n"," 是在句子中。 类似的想法在段落中。 段落构成文档。 \\n\\n\\\n"," 段落通常用一个或两个回车符分隔。 \\\n"," 回车符是您在该字符串中看到的嵌入的“反斜杠 n”。 \\\n"," 句子末尾有一个句号,但也有一个空格。\\\n"," 并且单词之间用空格分隔\"\"\"\n"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[{"data":{"text/plain":["177"]},"execution_count":12,"metadata":{},"output_type":"execute_result"}],"source":["len(some_text2)\n"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[],"source":["''' \n","依次传入分隔符列表,分别是双换行符、单换行符、空格、空字符,\n","因此在分割文本时,首先会采用双分换行符进行分割,同时依次使用其他分隔符进行分割\n","'''\n","\n","c_splitter = CharacterTextSplitter(\n"," chunk_size=450,\n"," chunk_overlap=0,\n"," separator=' '\n",")\n","r_splitter = RecursiveCharacterTextSplitter(\n"," chunk_size=450,\n"," chunk_overlap=0,\n"," separators=[\"\\n\\n\", \"\\n\", \" \", \"\"]\n",")\n"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[{"data":{"text/plain":["[\"When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.\",\n"," 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the \"backslash n\" you see embedded in this string. 
Sentences have a period at the end, but also, have a space.and words are separated by space.']"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["#分割结果\n","r_splitter.split_text(some_text1)\n"]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[{"data":{"text/plain":["['在编写文档时,作者将使用文档结构对内容进行分组。 这可以向读者传达哪些想法是相关的。 例如,密切相关的想法 是在句子中。 类似的想法在段落中。 段落构成文档。 \\n\\n 段落通常用一个或两个回车符分隔。 回车符是您在该字符串中看到的嵌入的“反斜杠 n”。 句子末尾有一个句号,但也有一个空格。 并且单词之间用空格分隔']"]},"execution_count":15,"metadata":{},"output_type":"execute_result"}],"source":["r_splitter.split_text(some_text2)\n"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"data":{"text/plain":["[\"When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example,\",\n"," 'closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',\n"," 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the \"backslash n\" you see embedded in this',\n"," 'string. Sentences have a period at the end, but also, have a space.and words are separated by space.']"]},"execution_count":16,"metadata":{},"output_type":"execute_result"}],"source":["#如果需要按照句子进行分隔,则还要用正则表达式添加一个句号分隔符\n","r_splitter = RecursiveCharacterTextSplitter(\n"," chunk_size=150,\n"," chunk_overlap=0,\n"," separators=[\"\\n\\n\", \"\\n\", \"(?<=\\. 
)\", \" \", \"\"]\n",")\n","r_splitter.split_text(some_text1)\n"]},{"cell_type":"code","execution_count":17,"metadata":{},"outputs":[{"data":{"text/plain":["['在编写文档时,作者将使用文档结构对内容进行分组。 这可以向读者传达哪些想法是相关的。 例如,密切相关的想法 是在句子中。 类似的想法在段落中。 段落构成文档。',\n"," '段落通常用一个或两个回车符分隔。 回车符是您在该字符串中看到的嵌入的“反斜杠 n”。 句子末尾有一个句号,但也有一个空格。 并且单词之间用空格分隔']"]},"execution_count":17,"metadata":{},"output_type":"execute_result"}],"source":["r_splitter = RecursiveCharacterTextSplitter(\n"," chunk_size=150,\n"," chunk_overlap=0,\n"," separators=[\"\\n\\n\", \"\\n\", \"(?<=\\. )\", \" \", \"\"]\n",")\n","r_splitter.split_text(some_text2)\n"]},{"attachments":{},"cell_type":"markdown","metadata":{},"source":["这就是递归字符文本分割器名字中“递归”的含义,总的来说,我们更建议在通用文本中使用递归字符文本分割器\n"]},{"attachments":{},"cell_type":"markdown","metadata":{},"source":["### 3.2 基于token的分割\n","很多LLM的上下文窗口长度限制是按照Token来计数的。因此,以LLM的视角,按照Token对文本进行分隔,通常可以得到更好的结果。\n","通过一个实例理解基于字符分割和基于Token分割的区别"]},{"cell_type":"code","execution_count":18,"metadata":{},"outputs":[{"data":{"text/plain":["['foo', ' bar', ' b', 'az', 'zy', 'foo']"]},"execution_count":18,"metadata":{},"output_type":"execute_result"}],"source":["# 使用token分割器进行分割,\n","# 将块大小设为1,块重叠大小设为0,相当于将任意字符串分割成了单个Token组成的列\n","from langchain.text_splitter import TokenTextSplitter\n","text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)\n","text1 = \"foo bar bazzyfoo\"\n","text_splitter.split_text(text1)\n"]},{"cell_type":"code","execution_count":19,"metadata":{},"outputs":[{"data":{"text/plain":["['test', ' base', 'ces', 'e']"]},"execution_count":19,"metadata":{},"output_type":"execute_result"}],"source":["text1 = \"test basecese\"\n","text_splitter.split_text(text1)\n"]},{"attachments":{},"cell_type":"markdown","metadata":{},"source":["可以看出token长度和字符长度不一样,token通常为4个字符"]},{"attachments":{},"cell_type":"markdown","metadata":{},"source":["### 3.3 分割Markdown文档\n","分块的目的是把具有上下文的文本放在一起,我们可以通过使用指定分隔符来进行分隔,但有些类型的文档(例如 
Markdown)本身就具有可用于分割的结构(如标题)。\n","\n","Markdown标题文本分割器会根据标题或子标题来分割一个Markdown文档,并将标题作为元数据添加到每个块中\n"]},{"cell_type":"code","execution_count":20,"metadata":{},"outputs":[],"source":["# 定义一个Markdown文档\n","\n","from langchain.document_loaders import NotionDirectoryLoader#Notion加载器\n","from langchain.text_splitter import MarkdownHeaderTextSplitter#markdown分割器\n","\n","markdown_document = \"\"\"# Title\\n\\n \\\n","## Chapter 1\\n\\n \\\n","Hi this is Jim\\n\\n Hi this is Joe\\n\\n \\\n","### Section \\n\\n \\\n","Hi this is Lance \\n\\n \n","## Chapter 2\\n\\n \\\n","Hi this is Molly\"\"\"\n"]},{"cell_type":"code","execution_count":21,"metadata":{},"outputs":[],"source":["markdown_document = \"\"\"# Title\\n\\n \\\n","## 第一章\\n\\n \\\n","李白乘舟将欲行\\n\\n 忽然岸上踏歌声\\n\\n \\\n","### Section \\n\\n \\\n","桃花潭水深千尺 \\n\\n \n","## 第二章\\n\\n \\\n","不及汪伦送我情\"\"\"\n"]},{"cell_type":"code","execution_count":22,"metadata":{},"outputs":[],"source":["# 定义想要分割的标题列表和名称\n","headers_to_split_on = [\n"," (\"#\", \"Header 1\"),\n"," (\"##\", \"Header 2\"),\n"," (\"###\", \"Header 3\"),\n","]\n"]},{"cell_type":"code","execution_count":23,"metadata":{},"outputs":[{"data":{"text/plain":["Document(page_content='桃花潭水深千尺', metadata={'Header 1': 'Title', 'Header 2': '第一章', 'Header 3': 'Section'})"]},"execution_count":23,"metadata":{},"output_type":"execute_result"}],"source":["# 初始化Markdown标题文本分割器,分割Markdown文档\n","markdown_splitter = MarkdownHeaderTextSplitter(\n"," headers_to_split_on=headers_to_split_on\n",")\n","md_header_splits = markdown_splitter.split_text(markdown_document)\n","\n","md_header_splits[0]\n","# Document(page_content='Hi this is Jim \\nHi this is Joe', metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1'})\n","\n","md_header_splits[1]\n","# Document(page_content='Hi this is Lance', metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1', 'Header 3': 
'Section'})\n"]},{"cell_type":"code","execution_count":24,"metadata":{},"outputs":[{"data":{"text/plain":["Document(page_content='李白乘舟将欲行 \\n忽然岸上踏歌声', metadata={'Header 1': 'Title', 'Header 2': '第一章'})"]},"execution_count":24,"metadata":{},"output_type":"execute_result"}],"source":["markdown_splitter = MarkdownHeaderTextSplitter(\n"," headers_to_split_on=headers_to_split_on\n",")#分割文档\n","md_header_splits = markdown_splitter.split_text(markdown_document)\n","#第一个块\n","md_header_splits[0]\n","# Document(page_content='Hi this is Jim \\nHi this is Joe', metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1'})\n","\n","\n"]},{"cell_type":"code","execution_count":25,"metadata":{},"outputs":[{"data":{"text/plain":["Document(page_content='桃花潭水深千尺', metadata={'Header 1': 'Title', 'Header 2': '第一章', 'Header 3': 'Section'})"]},"execution_count":25,"metadata":{},"output_type":"execute_result"}],"source":["#第二个块\n","md_header_splits[1]\n"]},{"attachments":{},"cell_type":"markdown","metadata":{},"source":["可以看到,每个块都包含了页面内容和元数据,元数据中记录了该块所属的标题和子标题。\n","\n","我们已经了解了如何将长文档分割为语义相关的块,并且包含正确的元数据。下一步则是将这些分块后的数据移动到向量存储中,以便进行检索或生成"]},{"cell_type":"code","execution_count":26,"metadata":{},"outputs":[],"source":["#加载数据库的内容\n","loader = NotionDirectoryLoader(\"docs/Notion_DB\")\n","docs = loader.load()\n","txt = ' '.join([d.page_content for d in docs])#拼接文档\n"]},{"cell_type":"code","execution_count":27,"metadata":{},"outputs":[],"source":["headers_to_split_on = [\n"," (\"#\", \"Header 1\"),\n"," (\"##\", \"Header 2\"),\n","]\n","#加载文档分割器\n","markdown_splitter = MarkdownHeaderTextSplitter(\n"," headers_to_split_on=headers_to_split_on\n",")\n"]},{"cell_type":"code","execution_count":28,"metadata":{},"outputs":[],"source":["md_header_splits = markdown_splitter.split_text(txt)#分割文本内容\n"]},{"cell_type":"code","execution_count":29,"metadata":{},"outputs":[{"data":{"text/plain":["Document(page_content=\"Saying goodbye to Blendle (from a colleague) and to a colleague (from Blendle) is a very normal 
and natural thing. When done right, it can even be a beautiful thing. \\nWe advise you to read the backdrop below first, but feel free to jump in right away with the 'Here's what you can do'-section :). General note: you do not have to do this alone, so please ask for advice and help! \\n- **Backdrop** \\nSaying goodbye to Blendle (from a colleague) and to a colleague (from Blendle) is a very normal and natural thing. When done right, it can even be a beautiful thing. \\nSaying goodbye to people is also an important part of keeping your team on the right track. Firing can even be a part of your [Personnel Planning](https://www.notion.so/Hiring-451bbcfe8d9b49438c0633326bb7af0a?pvs=21). The most common situation will be when you think someone is no longer a good match with Blendle for whatever reason. This doesn't happen overnight, so try to spot situations where this is happening. \\nTo make things clear: HR and Rick and Alex will help you with this, but we want this to be on your agenda. HR will take care of the process and paper- and legal work. Rick and Alex will help with the tough conversations. **The most important work happens months before that and that's where you come in.** \\nWhy? You actually work with your people and see how they are really doing. You have 1on1's, off-sites, projects and feedback sessions together. All the information flows to you as a lead :).\", metadata={'Header 1': 'Firing'})"]},"execution_count":29,"metadata":{},"output_type":"execute_result"}],"source":["md_header_splits[0]#分割结果\n"]}],"metadata":{"kernelspec":{"display_name":"chatGPT","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.9"},"orig_nbformat":4,"vscode":{"interpreter":{"hash":"4d8dc73ac51fd938ce7dec941fbf542c26232b3529b0c2a6ebc607bfa3d5aa69"}}},"nbformat":4,"nbformat_minor":2}
+{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# \u7b2c\u4e09\u7ae0 \u6587\u6863\u5206\u5272", "\n", " - [\u4e00\u3001\u914d\u7f6e\u73af\u5883](#\u4e00\u3001\u914d\u7f6e\u73af\u5883)\n", " - [\u4e8c\u3001\u6587\u6863\u5206\u5272\u91cd\u8981\u6027](#\u4e8c\u3001\u6587\u6863\u5206\u5272\u91cd\u8981\u6027)\n", " - [\u4e09\u3001\u6587\u6863\u5206\u5272\u65b9\u5f0f](#\u4e09\u3001\u6587\u6863\u5206\u5272\u65b9\u5f0f)\n", " - [3.1 \u57fa\u4e8e\u5b57\u7b26\u7684\u5206\u5272\u65b9\u5f0f](#3.1-\u57fa\u4e8e\u5b57\u7b26\u7684\u5206\u5272\u65b9\u5f0f)\n", " - [3.2 \u57fa\u4e8etoken\u7684\u5206\u5272](#3.2-\u57fa\u4e8etoken\u7684\u5206\u5272)\n", " - [3.3 \u5206\u5272Markdown\u6587\u6863](#3.3-\u5206\u5272Markdown\u6587\u6863)\n"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u6211\u4eec\u521a\u521a\u8ba8\u8bba\u4e86\u5982\u4f55\u5c06\u6587\u6863\u52a0\u8f7d\u5230\u6807\u51c6\u683c\u5f0f\u4e2d\uff0c\u73b0\u5728\u6211\u4eec\u8981\u8c08\u8bba\u5982\u4f55\u5c06\u5b83\u4eec\u5206\u5272\u6210\u8f83\u5c0f\u7684\u5757\u3002\u8fd9\u542c\u8d77\u6765\u53ef\u80fd\u5f88\u7b80\u5355\uff0c\u4f46\u5176\u4e2d\u6709\u5f88\u591a\u5fae\u5999\u4e4b\u5904\u4f1a\u5bf9\u540e\u7eed\u5de5\u4f5c\u4ea7\u751f\u91cd\u8981\u5f71\u54cd\u3002"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## \u4e00\u3001\u914d\u7f6e\u73af\u5883"]}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["import os\n", "import openai\n", "import sys\n", "sys.path.append('../..')\n", "\n", "from dotenv import load_dotenv, find_dotenv\n", "_ = load_dotenv(find_dotenv()) \n", "\n", "openai.api_key = os.environ['OPENAI_API_KEY']"]}, {"attachments": {"image.png": {"image/png": ""}}, "cell_type": "markdown", "metadata": {}, "source": ["## \u4e8c\u3001\u6587\u6863\u5206\u5272\u91cd\u8981\u6027\n", 
"\u6570\u636e\u52a0\u8f7d\u5b8c\u6bd5\u540e\u8fdb\u884c\u5206\u5272\uff0c\u5206\u5272\u5b8c\u6bd5\u540e\u653e\u5165\u5411\u91cf\u6570\u636e\u5e93\uff0c\u8fdb\u884c\u5411\u91cf\u5316\n", "\n", "\n", "\n", "\u5982\u679c\u76f4\u63a5\u6309\u7167\u5355\u4e2a\u5b57\u7b26\u8fdb\u884c\u5206\u5272\uff0c\u4f1a\u4e22\u5931\u6587\u672c\u7684\u8bed\u4e49\u4fe1\u606f\uff0c\u4ece\u800c\u5bfc\u81f4\u65e0\u6cd5\u6b63\u786e\u7684\u56de\u7b54\u95ee\u9898\u3002\u56e0\u6b64\uff0c\u9700\u8981\u5c3d\u91cf\u4fdd\u6301\u8bed\u4e49\u7684\u5b8c\u6574\u6027\uff0c\u5206\u5272\u51fa\u5177\u6709\u8bed\u4e49\u7684\u5757\u3002"]}, {"attachments": {"image-2.png": {"image/png": ""}, "image.png": {"image/png": ""}}, "cell_type": "markdown", "metadata": {}, "source": ["## \u4e09\u3001\u6587\u6863\u5206\u5272\u65b9\u5f0f\n", "Langchain\u4e2d\u6587\u672c\u5206\u5272\u5668\u90fd\u6839\u636echunk_size(\u5757\u5927\u5c0f)\u548cchunk_overlap(\u5757\u4e0e\u5757\u4e4b\u95f4\u7684\u91cd\u53e0\u5927\u5c0f)\u8fdb\u884c\u5206\u5272\u3002\n", "\n", "\n", "\n", "* chunk_size\u6307\u6bcf\u4e2a\u5757\u5305\u542b\u7684\u5b57\u7b26\u6216Token\uff08\u5982\u5355\u8bcd\u3001\u53e5\u5b50\u7b49\uff09\u7684\u6570\u91cf\n", "\n", "* chunk_overlap\u6307\u4e24\u4e2a\u5757\u4e4b\u95f4\u5171\u4eab\u7684\u5b57\u7b26\u6570\u91cf\uff0c\u7528\u4e8e\u4fdd\u6301\u4e0a\u4e0b\u6587\u7684\u8fde\u8d2f\u6027\uff0c\u907f\u514d\u5206\u5272\u4e22\u5931\u4e0a\u4e0b\u6587\u4fe1\u606f\n", "\n", "\n", "\n", "Langchain\u63d0\u4f9b\u591a\u79cd\u6587\u6863\u5206\u5272\u65b9\u5f0f\uff0c\u533a\u522b\u5728\u600e\u4e48\u786e\u5b9a\u5757\u4e0e\u5757\u4e4b\u95f4\u7684\u8fb9\u754c\u3001\u5757\u7531\u54ea\u4e9b\u5b57\u7b26/token\u7ec4\u6210\u3001\u4ee5\u53ca\u5982\u4f55\u6d4b\u91cf\u5757\u5927\u5c0f"]}, {"attachments": {}, "cell_type": "markdown", "metadata": {}, "source": ["### 3.1 \u57fa\u4e8e\u5b57\u7b26\u7684\u5206\u5272\u65b9\u5f0f\n", 
"\u600e\u4e48\u5206\u5272\u5757\u53d6\u51b3\u4e8e\u6211\u4eec\u7684\u4efb\u52a1\u7c7b\u578b\uff0c\u5f53\u60a8\u62c6\u5206\u4ee3\u7801\u65f6\uff0c\u8fd9\u4e00\u70b9\u975e\u5e38\u660e\u663e\u3002\u6240\u4ee5\u6211\u4eec\u6709\u4e00\u4e2a\u8bed\u8a00\u6587\u672c\u5206\u5272\u5668\uff0c\u5b83\u6709\u4e00\u5806\u4e0d\u540c\u7684\u5206\u9694\u7b26\uff0c\u7528\u4e8e\u5404\u79cdPython\u3001Ruby\u3001C \u7b49\u4e0d\u540c\u8bed\u8a00\u7684\u7248\u672c\u3002\u5728\u62c6\u5206\u8fd9\u4e9b\u6587\u6863\u65f6\uff0c\u8003\u8651\u8fd9\u4e9b\u4e0d\u540c\u7684\u8bed\u8a00\u7684\u5dee\u5f02\u6027\u3002"]}, {"cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": ["\n", "''' \n", "* RecursiveCharacterTextSplitter \u9012\u5f52\u5b57\u7b26\u6587\u672c\u5206\u5272\n", "* CharacterTextSplitter \u5b57\u7b26\u5206\u5272\n", "RecursiveCharacterTextSplitter \u5c06\u6309\u4e0d\u540c\u7684\u5b57\u7b26\u9012\u5f52\u5730\u5206\u5272(\u6309\u7167\u8fd9\u4e2a\u4f18\u5148\u7ea7[\"\\n\\n\", \"\\n\", \" \", \"\"])\uff0c\u8fd9\u6837\u5c31\u80fd\u5c3d\u91cf\u628a\u6240\u6709\u548c\u8bed\u4e49\u76f8\u5173\u7684\u5185\u5bb9\u5c3d\u53ef\u80fd\u957f\u65f6\u95f4\u5730\u4fdd\u7559\u5728\u540c\u4e00\u4f4d\u7f6e\n", "RecursiveCharacterTextSplitter\u9700\u8981\u5173\u6ce8\u7684\u662f4\u4e2a\u53c2\u6570\uff1a\n", "\n", "* separators - \u5206\u9694\u7b26\u5b57\u7b26\u4e32\u6570\u7ec4\n", "* chunk_size - \u6bcf\u4e2a\u6587\u6863\u7684\u5b57\u7b26\u6570\u91cf\u9650\u5236\n", "* chunk_overlap - \u4e24\u4efd\u6587\u6863\u91cd\u53e0\u533a\u57df\u7684\u957f\u5ea6\n", "* length_function - \u957f\u5ea6\u8ba1\u7b97\u51fd\u6570\n", "CharacterTextSplitter \u5b57\u7b26\u5206\u5272\n", "\u5206\u9694\u7b26\u7684\u53c2\u6570\u662f\u5355\u4e2a\u7684\u5b57\u7b26\u4e32\u3002\u8fd9\u5c31\u4f1a\u4f7f\u5f97RecursiveCharacterTextSplitter\u6bd4CharacterTextSplitter\u5bf9\u6587\u6863\u5207\u5272\u5f97\u66f4\u52a0\u788e\u7247\u5316\n", "'''\n", "#\u5bfc\u5165\u6587\u672c\u5206\u5272\u5668\n", "from 
langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter\n"]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": ["chunk_size = 26 #\u8bbe\u7f6e\u5757\u5927\u5c0f\n", "chunk_overlap = 4 #\u8bbe\u7f6e\u5757\u91cd\u53e0\u5927\u5c0f\n"]}, {"cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": ["#\u521d\u59cb\u5316\u6587\u672c\u5206\u5272\u5668\n", "r_splitter = RecursiveCharacterTextSplitter(\n", " chunk_size=chunk_size,\n", " chunk_overlap=chunk_overlap\n", ")\n", "c_splitter = CharacterTextSplitter(\n", " chunk_size=chunk_size,\n", " chunk_overlap=chunk_overlap\n", ")\n"]}, {"cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [{"data": {"text/plain": ["['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg']"]}, "execution_count": 15, "metadata": {}, "output_type": "execute_result"}], "source": ["#\u4f7f\u7528\u9012\u5f52\u5b57\u7b26\u6587\u672c\u5206\u5272\u5668\n", "text2 = \"abcdefghijklmnopqrstuvwxyzabcdefg\"\n", "r_splitter.split_text(text2)\n"]}, {"cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [{"data": {"text/plain": ["['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']"]}, "execution_count": 18, "metadata": {}, "output_type": "execute_result"}], "source": ["text3 = \"a b c d e f g h i j k l m n o p q r s t u v w x y z\"#\u6d4b\u8bd5\u6587\u672c\n", "r_splitter.split_text(text3)\n"]}, {"attachments": {}, "cell_type": "markdown", "metadata": {}, "source": ["\u7b2c\u4e8c\u5757\u662f\u4ece'wxyz'\u5f00\u59cb\u7684\uff0c\u521a\u597d\u662f\u6211\u4eec\u8bbe\u5b9a\u7684\u5757\u91cd\u53e0\u5927\u5c0f"]}, {"cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [{"data": {"text/plain": ["['a b c d e f g h i j k l m n o p q r s t u v w x y z']"]}, "execution_count": 19, "metadata": {}, "output_type": "execute_result"}], "source": ["#\u5b57\u7b26\u6587\u672c\u5206\u5272\u5668\n", 
"c_splitter.split_text(text3)\n"]}, {"attachments": {}, "cell_type": "markdown", "metadata": {}, "source": ["\u53ef\u4ee5\u770b\u5230\u5b57\u7b26\u5206\u5272\u5668\u6ca1\u6709\u5206\u5272\u8fd9\u4e2a\u6587\u672c\uff0c\u56e0\u4e3a\u5b57\u7b26\u6587\u672c\u5206\u5272\u5668\u9ed8\u8ba4\u4ee5\u6362\u884c\u7b26\u4e3a\u5206\u9694\u7b26\uff0c\u56e0\u6b64\u9700\u8981\u8bbe\u7f6e\u7a7a\u683c\u4e3a\u5206\u9694\u7b26\u3002"]}, {"cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [{"data": {"text/plain": ["['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']"]}, "execution_count": 21, "metadata": {}, "output_type": "execute_result"}], "source": ["# \u8bbe\u7f6e\u7a7a\u683c\u5206\u9694\u7b26\n", "c_splitter = CharacterTextSplitter(\n", " chunk_size=chunk_size,\n", " chunk_overlap=chunk_overlap,\n", " separator=' '\n", ")\n", "c_splitter.split_text(text3)\n"]}, {"cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": ["# \u9012\u5f52\u5206\u5272\u957f\u6bb5\u843d\n", "some_text1 = \"\"\"When writing documents, writers will use document structure to group content. \\\n", "This can convey to the reader, which idea's are related. For example, closely related ideas \\\n", "are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \\n\\n \\\n", "Paragraphs are often delimited with a carriage return or two carriage returns. \\\n", "Carriage returns are the \"backslash n\" you see embedded in this string. 
\\\n", "Sentences have a period at the end, but also, have a space.\\\n", "and words are separated by space.\"\"\"\n"]}, {"cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [{"data": {"text/plain": ["496"]}, "execution_count": 38, "metadata": {}, "output_type": "execute_result"}], "source": ["len(some_text1)\n"]}, {"cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": ["# \u4e2d\u6587\u7248\n", "some_text2 = \"\"\"\u5728\u7f16\u5199\u6587\u6863\u65f6\uff0c\u4f5c\u8005\u5c06\u4f7f\u7528\u6587\u6863\u7ed3\u6784\u5bf9\u5185\u5bb9\u8fdb\u884c\u5206\u7ec4\u3002 \\\n", " \u8fd9\u53ef\u4ee5\u5411\u8bfb\u8005\u4f20\u8fbe\u54ea\u4e9b\u60f3\u6cd5\u662f\u76f8\u5173\u7684\u3002 \u4f8b\u5982\uff0c\u5bc6\u5207\u76f8\u5173\u7684\u60f3\u6cd5\\\n", " \u662f\u5728\u53e5\u5b50\u4e2d\u3002 \u7c7b\u4f3c\u7684\u60f3\u6cd5\u5728\u6bb5\u843d\u4e2d\u3002 \u6bb5\u843d\u6784\u6210\u6587\u6863\u3002 \\n\\n\\\n", " \u6bb5\u843d\u901a\u5e38\u7528\u4e00\u4e2a\u6216\u4e24\u4e2a\u56de\u8f66\u7b26\u5206\u9694\u3002 \\\n", " \u56de\u8f66\u7b26\u662f\u60a8\u5728\u8be5\u5b57\u7b26\u4e32\u4e2d\u770b\u5230\u7684\u5d4c\u5165\u7684\u201c\u53cd\u659c\u6760 n\u201d\u3002 \\\n", " \u53e5\u5b50\u672b\u5c3e\u6709\u4e00\u4e2a\u53e5\u53f7\uff0c\u4f46\u4e5f\u6709\u4e00\u4e2a\u7a7a\u683c\u3002\\\n", " \u5e76\u4e14\u5355\u8bcd\u4e4b\u95f4\u7528\u7a7a\u683c\u5206\u9694\"\"\"\n"]}, {"cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [{"data": {"text/plain": ["177"]}, "execution_count": 40, "metadata": {}, "output_type": "execute_result"}], "source": ["len(some_text2)\n"]}, {"cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": ["''' \n", "\u4f9d\u6b21\u4f20\u5165\u5206\u9694\u7b26\u5217\u8868\uff0c\u5206\u522b\u662f\u53cc\u6362\u884c\u7b26\u3001\u5355\u6362\u884c\u7b26\u3001\u7a7a\u683c\u3001\u7a7a\u5b57\u7b26\uff0c\n", 
"\u56e0\u6b64\u5728\u5206\u5272\u6587\u672c\u65f6\uff0c\u9996\u5148\u4f1a\u91c7\u7528\u53cc\u5206\u6362\u884c\u7b26\u8fdb\u884c\u5206\u5272\uff0c\u540c\u65f6\u4f9d\u6b21\u4f7f\u7528\u5176\u4ed6\u5206\u9694\u7b26\u8fdb\u884c\u5206\u5272\n", "'''\n", "\n", "c_splitter = CharacterTextSplitter(\n", " chunk_size=450,\n", " chunk_overlap=0,\n", " separator=' '\n", ")\n", "r_splitter = RecursiveCharacterTextSplitter(\n", " chunk_size=450,\n", " chunk_overlap=0,\n", " separators=[\"\\n\\n\", \"\\n\", \" \", \"\"]\n", ")\n"]}, {"cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [{"data": {"text/plain": ["[\"When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.\",\n", " 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the \"backslash n\" you see embedded in this string. 
Sentences have a period at the end, but also, have a space.and words are separated by space.']"]}, "execution_count": 43, "metadata": {}, "output_type": "execute_result"}], "source": ["#\u5206\u5272\u7ed3\u679c\n", "r_splitter.split_text(some_text1)\n"]}, {"cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [{"data": {"text/plain": ["['\u5728\u7f16\u5199\u6587\u6863\u65f6\uff0c\u4f5c\u8005\u5c06\u4f7f\u7528\u6587\u6863\u7ed3\u6784\u5bf9\u5185\u5bb9\u8fdb\u884c\u5206\u7ec4\u3002 \u8fd9\u53ef\u4ee5\u5411\u8bfb\u8005\u4f20\u8fbe\u54ea\u4e9b\u60f3\u6cd5\u662f\u76f8\u5173\u7684\u3002 \u4f8b\u5982\uff0c\u5bc6\u5207\u76f8\u5173\u7684\u60f3\u6cd5 \u662f\u5728\u53e5\u5b50\u4e2d\u3002 \u7c7b\u4f3c\u7684\u60f3\u6cd5\u5728\u6bb5\u843d\u4e2d\u3002 \u6bb5\u843d\u6784\u6210\u6587\u6863\u3002 \\n\\n \u6bb5\u843d\u901a\u5e38\u7528\u4e00\u4e2a\u6216\u4e24\u4e2a\u56de\u8f66\u7b26\u5206\u9694\u3002 \u56de\u8f66\u7b26\u662f\u60a8\u5728\u8be5\u5b57\u7b26\u4e32\u4e2d\u770b\u5230\u7684\u5d4c\u5165\u7684\u201c\u53cd\u659c\u6760 n\u201d\u3002 \u53e5\u5b50\u672b\u5c3e\u6709\u4e00\u4e2a\u53e5\u53f7\uff0c\u4f46\u4e5f\u6709\u4e00\u4e2a\u7a7a\u683c\u3002 \u5e76\u4e14\u5355\u8bcd\u4e4b\u95f4\u7528\u7a7a\u683c\u5206\u9694']"]}, "execution_count": 44, "metadata": {}, "output_type": "execute_result"}], "source": ["r_splitter.split_text(some_text2)\n"]}, {"cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [{"data": {"text/plain": ["[\"When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related.\",\n", " 'For example, closely related ideas are in sentances. Similar ideas are in paragraphs. 
Paragraphs form a document.',\n", " 'Paragraphs are often delimited with a carriage return or two carriage returns.',\n", " 'Carriage returns are the \"backslash n\" you see embedded in this string.',\n", " 'Sentences have a period at the end, but also, have a space.and words are separated by space.']"]}, "execution_count": 45, "metadata": {}, "output_type": "execute_result"}], "source": ["#\u5982\u679c\u9700\u8981\u6309\u7167\u53e5\u5b50\u8fdb\u884c\u5206\u9694\uff0c\u5219\u8fd8\u8981\u7528\u6b63\u5219\u8868\u8fbe\u5f0f\u6dfb\u52a0\u4e00\u4e2a\u53e5\u53f7\u5206\u9694\u7b26\n", "r_splitter = RecursiveCharacterTextSplitter(\n", " chunk_size=150,\n", " chunk_overlap=0,\n", " separators=[\"\\n\\n\", \"\\n\", \"(?<=\\. )\", \" \", \"\"]\n", ")\n", "r_splitter.split_text(some_text1)\n"]}, {"cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [{"data": {"text/plain": ["['\u5728\u7f16\u5199\u6587\u6863\u65f6\uff0c\u4f5c\u8005\u5c06\u4f7f\u7528\u6587\u6863\u7ed3\u6784\u5bf9\u5185\u5bb9\u8fdb\u884c\u5206\u7ec4\u3002 \u8fd9\u53ef\u4ee5\u5411\u8bfb\u8005\u4f20\u8fbe\u54ea\u4e9b\u60f3\u6cd5\u662f\u76f8\u5173\u7684\u3002 \u4f8b\u5982\uff0c\u5bc6\u5207\u76f8\u5173\u7684\u60f3\u6cd5 \u662f\u5728\u53e5\u5b50\u4e2d\u3002 \u7c7b\u4f3c\u7684\u60f3\u6cd5\u5728\u6bb5\u843d\u4e2d\u3002 \u6bb5\u843d\u6784\u6210\u6587\u6863\u3002',\n", " '\u6bb5\u843d\u901a\u5e38\u7528\u4e00\u4e2a\u6216\u4e24\u4e2a\u56de\u8f66\u7b26\u5206\u9694\u3002 \u56de\u8f66\u7b26\u662f\u60a8\u5728\u8be5\u5b57\u7b26\u4e32\u4e2d\u770b\u5230\u7684\u5d4c\u5165\u7684\u201c\u53cd\u659c\u6760 n\u201d\u3002 \u53e5\u5b50\u672b\u5c3e\u6709\u4e00\u4e2a\u53e5\u53f7\uff0c\u4f46\u4e5f\u6709\u4e00\u4e2a\u7a7a\u683c\u3002 \u5e76\u4e14\u5355\u8bcd\u4e4b\u95f4\u7528\u7a7a\u683c\u5206\u9694']"]}, "execution_count": 47, "metadata": {}, "output_type": "execute_result"}], "source": ["r_splitter = RecursiveCharacterTextSplitter(\n", " chunk_size=150,\n", " chunk_overlap=0,\n", " separators=[\"\\n\\n\", \"\\n\", \"(?<=\\. 
)\", \" \", \"\"]\n", ")\n", "r_splitter.split_text(some_text2)\n"]}, {"attachments": {}, "cell_type": "markdown", "metadata": {}, "source": ["\u8fd9\u5c31\u662f\u9012\u5f52\u5b57\u7b26\u6587\u672c\u5206\u5272\u5668\u540d\u5b57\u4e2d\u201c\u9012\u5f52\u201d\u7684\u542b\u4e49\uff0c\u603b\u7684\u6765\u8bf4\uff0c\u6211\u4eec\u66f4\u5efa\u8bae\u5728\u901a\u7528\u6587\u672c\u4e2d\u4f7f\u7528\u9012\u5f52\u5b57\u7b26\u6587\u672c\u5206\u5272\u5668\n"]}, {"attachments": {}, "cell_type": "markdown", "metadata": {}, "source": ["### 3.2 \u57fa\u4e8etoken\u7684\u5206\u5272\n", "\u5f88\u591aLLM\u7684\u4e0a\u4e0b\u6587\u7a97\u53e3\u957f\u5ea6\u9650\u5236\u662f\u6309\u7167Token\u6765\u8ba1\u6570\u7684\u3002\u56e0\u6b64\uff0c\u4ee5LLM\u7684\u89c6\u89d2\uff0c\u6309\u7167Token\u5bf9\u6587\u672c\u8fdb\u884c\u5206\u9694\uff0c\u901a\u5e38\u53ef\u4ee5\u5f97\u5230\u66f4\u597d\u7684\u7ed3\u679c\u3002\n", "\u901a\u8fc7\u4e00\u4e2a\u5b9e\u4f8b\u7406\u89e3\u57fa\u4e8e\u5b57\u7b26\u5206\u5272\u548c\u57fa\u4e8eToken\u5206\u5272\u7684\u533a\u522b"]}, {"cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [{"data": {"text/plain": ["['foo', ' bar', ' b', 'az', 'zy', 'foo']"]}, "execution_count": 50, "metadata": {}, "output_type": "execute_result"}], "source": ["# \u4f7f\u7528token\u5206\u5272\u5668\u8fdb\u884c\u5206\u5272\uff0c\n", "# \u5c06\u5757\u5927\u5c0f\u8bbe\u4e3a1\uff0c\u5757\u91cd\u53e0\u5927\u5c0f\u8bbe\u4e3a0\uff0c\u76f8\u5f53\u4e8e\u5c06\u4efb\u610f\u5b57\u7b26\u4e32\u5206\u5272\u6210\u4e86\u5355\u4e2aToken\u7ec4\u6210\u7684\u5217\n", "from langchain.text_splitter import TokenTextSplitter\n", "text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)\n", "text1 = \"foo bar bazzyfoo\"\n", "text_splitter.split_text(text1)\n"]}, {"cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [{"data": {"text/plain": ["['test', ' base', 'ces', 'e']"]}, "execution_count": 55, "metadata": {}, "output_type": "execute_result"}], "source": ["text1 = \"test 
basecese\"\n", "text_splitter.split_text(text1)\n"]}, {"attachments": {}, "cell_type": "markdown", "metadata": {}, "source": ["\u53ef\u4ee5\u770b\u51fatoken\u957f\u5ea6\u548c\u5b57\u7b26\u957f\u5ea6\u4e0d\u4e00\u6837\uff0ctoken\u901a\u5e38\u4e3a4\u4e2a\u5b57\u7b26"]}, {"attachments": {}, "cell_type": "markdown", "metadata": {}, "source": ["### 3.3 \u5206\u5272Markdown\u6587\u6863\n", "\u5206\u5757\u7684\u76ee\u7684\u662f\u628a\u5177\u6709\u4e0a\u4e0b\u6587\u7684\u6587\u672c\u653e\u5728\u4e00\u8d77\uff0c\u6211\u4eec\u53ef\u4ee5\u901a\u8fc7\u4f7f\u7528\u6307\u5b9a\u5206\u9694\u7b26\u6765\u8fdb\u884c\u5206\u9694\uff0c\u4f46\u6709\u4e9b\u7c7b\u578b\u7684\u6587\u6863\uff08\u4f8b\u5982 Markdown\uff09\u672c\u8eab\u5c31\u5177\u6709\u53ef\u7528\u4e8e\u5206\u5272\u7684\u7ed3\u6784\uff08\u5982\u6807\u9898\uff09\u3002\n", "\n", "Markdown\u6807\u9898\u6587\u672c\u5206\u5272\u5668\u4f1a\u6839\u636e\u6807\u9898\u6216\u5b50\u6807\u9898\u6765\u5206\u5272\u4e00\u4e2aMarkdown\u6587\u6863\uff0c\u5e76\u5c06\u6807\u9898\u4f5c\u4e3a\u5143\u6570\u636e\u6dfb\u52a0\u5230\u6bcf\u4e2a\u5757\u4e2d\n"]}, {"cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": ["# \u5b9a\u4e49\u4e00\u4e2aMarkdown\u6587\u6863\n", "\n", "from langchain.document_loaders import NotionDirectoryLoader#Notion\u52a0\u8f7d\u5668\n", "from langchain.text_splitter import MarkdownHeaderTextSplitter#markdown\u5206\u5272\u5668\n", "\n", "markdown_document = \"\"\"# Title\\n\\n \\\n", "## Chapter 1\\n\\n \\\n", "Hi this is Jim\\n\\n Hi this is Joe\\n\\n \\\n", "### Section \\n\\n \\\n", "Hi this is Lance \\n\\n \n", "## Chapter 2\\n\\n \\\n", "Hi this is Molly\"\"\"\n"]}, {"cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": ["markdown_document = \"\"\"# Title\\n\\n \\\n", "## \u7b2c\u4e00\u7ae0\\n\\n \\\n", "\u674e\u767d\u4e58\u821f\u5c06\u6b32\u884c\\n\\n \u5ffd\u7136\u5cb8\u4e0a\u8e0f\u6b4c\u58f0\\n\\n \\\n", "### Section \\n\\n \\\n", 
"\u6843\u82b1\u6f6d\u6c34\u6df1\u5343\u5c3a \\n\\n \n", "## \u7b2c\u4e8c\u7ae0\\n\\n \\\n", "\u4e0d\u53ca\u6c6a\u4f26\u9001\u6211\u60c5\"\"\"\n"]}, {"cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": ["# \u5b9a\u4e49\u60f3\u8981\u5206\u5272\u7684\u6807\u9898\u5217\u8868\u548c\u540d\u79f0\n", "headers_to_split_on = [\n", " (\"#\", \"Header 1\"),\n", " (\"##\", \"Header 2\"),\n", " (\"###\", \"Header 3\"),\n", "]\n"]}, {"cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [{"data": {"text/plain": ["Document(page_content='Hi this is Lance', metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1', 'Header 3': 'Section'})"]}, "execution_count": 60, "metadata": {}, "output_type": "execute_result"}], "source": ["# \u521d\u59cb\u5316Markdown\u6807\u9898\u6587\u672c\u5206\u5272\u5668\uff0c\u5206\u5272Markdown\u6587\u6863\n", "markdown_splitter = MarkdownHeaderTextSplitter(\n", " headers_to_split_on=headers_to_split_on\n", ")\n", "md_header_splits = markdown_splitter.split_text(markdown_document)\n", "\n", "md_header_splits[0]\n", "# Document(page_content='Hi this is Jim \\nHi this is Joe', metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1'})\n", "\n", "md_header_splits[1]\n", "# Document(page_content='Hi this is Lance', metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1', 'Header 3': 'Section'})\n"]}, {"cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [{"data": {"text/plain": ["Document(page_content='\u674e\u767d\u4e58\u821f\u5c06\u6b32\u884c \\n\u5ffd\u7136\u5cb8\u4e0a\u8e0f\u6b4c\u58f0 \\n### Section \\n\u6843\u82b1\u6f6d\u6c34\u6df1\u5343\u5c3a', metadata={'Header 1': 'Title', 'Header 2': '\u7b2c\u4e00\u7ae0'})"]}, "execution_count": 68, "metadata": {}, "output_type": "execute_result"}], "source": ["markdown_splitter = MarkdownHeaderTextSplitter(\n", " headers_to_split_on=headers_to_split_on\n", ")#\u5206\u5272\u6587\u6863\n", "md_header_splits = 
markdown_splitter.split_text(markdown_document)\n", "#\u7b2c\u4e00\u4e2a\u5757\n", "md_header_splits[0]\n", "# Document(page_content='Hi this is Jim \\nHi this is Joe', metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1'})\n", "\n", "\n"]}, {"cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [{"data": {"text/plain": ["Document(page_content='\u4e0d\u53ca\u6c6a\u4f26\u9001\u6211\u60c5', metadata={'Header 1': 'Title', 'Header 2': '\u7b2c\u4e8c\u7ae0'})"]}, "execution_count": 71, "metadata": {}, "output_type": "execute_result"}], "source": ["#\u7b2c\u4e8c\u4e2a\u5757\n", "md_header_splits[1]\n"]}, {"attachments": {}, "cell_type": "markdown", "metadata": {}, "source": ["\u53ef\u4ee5\u770b\u5230\uff0c\u6bcf\u4e2a\u5757\u90fd\u5305\u542b\u4e86\u9875\u9762\u5185\u5bb9\u548c\u5143\u6570\u636e\uff0c\u5143\u6570\u636e\u4e2d\u8bb0\u5f55\u4e86\u8be5\u5757\u6240\u5c5e\u7684\u6807\u9898\u548c\u5b50\u6807\u9898\u3002\n", "\n", "\u6211\u4eec\u5df2\u7ecf\u4e86\u89e3\u4e86\u5982\u4f55\u5c06\u957f\u6587\u6863\u5206\u5272\u4e3a\u8bed\u4e49\u76f8\u5173\u7684\u5757\uff0c\u5e76\u4e14\u5305\u542b\u6b63\u786e\u7684\u5143\u6570\u636e\u3002\u4e0b\u4e00\u6b65\u5219\u662f\u5c06\u8fd9\u4e9b\u5206\u5757\u540e\u7684\u6570\u636e\u79fb\u52a8\u5230\u5411\u91cf\u5b58\u50a8\u4e2d\uff0c\u4ee5\u4fbf\u8fdb\u884c\u68c0\u7d22\u6216\u751f\u6210"]}, {"cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": ["#\u52a0\u8f7d\u6570\u636e\u5e93\u7684\u5185\u5bb9\n", "loader = NotionDirectoryLoader(\"docs/Notion_DB\")\n", "docs = loader.load()\n", "txt = ' '.join([d.page_content for d in docs])#\u62fc\u63a5\u6587\u6863\n"]}, {"cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": ["headers_to_split_on = [\n", " (\"#\", \"Header 1\"),\n", " (\"##\", \"Header 2\"),\n", "]\n", "#\u52a0\u8f7d\u6587\u6863\u5206\u5272\u5668\n", "markdown_splitter = MarkdownHeaderTextSplitter(\n", " headers_to_split_on=headers_to_split_on\n", ")\n"]}, 
{"cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [], "source": ["md_header_splits = markdown_splitter.split_text(txt)#\u5206\u5272\u6587\u672c\u5185\u5bb9\n"]}, {"cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [{"data": {"text/plain": ["Document(page_content=\"Saying goodbye to Blendle (from a colleague) and to a colleague (from Blendle) is a very normal and natural thing. When done right, it can even be a beautiful thing. \\nWe advise you to read the backdrop below first, but feel free to jump in right away with the 'Here's what you can do'-section :). General note: you do not have to do this alone, so please ask for advice and help! \\n- **Backdrop** \\nSaying goodbye to Blendle (from a colleague) and to a colleague (from Blendle) is a very normal and natural thing. When done right, it can even be a beautiful thing. \\nSaying goodbye to people is also an important part of keeping your team on the right track. Firing can even be a part of your [Personnel Planning](https://www.notion.so/Hiring-451bbcfe8d9b49438c0633326bb7af0a?pvs=21). The most common situation will be when you think someone is no longer a good match with Blendle for whatever reason. This doesn't happen overnight, so try to spot situations where this is happening. \\nTo make things clear: HR and Rick and Alex will help you with this, but we want this to be on your agenda. HR will take care of the process and paper- and legal work. Rick and Alex will help with the tough conversations. **The most important work happens months before that and that's where you come in.** \\nWhy? You actually work with your people and see how they are really doing. You have 1on1's, off-sites, projects and feedback sessions together. 
All the information flows to you as a lead :).\", metadata={'Header 1': 'Firing'})"]}, "execution_count": 75, "metadata": {}, "output_type": "execute_result"}], "source": ["md_header_splits[0]#\u5206\u5272\u7ed3\u679c\n"]}], "metadata": {"kernelspec": {"display_name": "chatGPT", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16"}, "orig_nbformat": 4, "vscode": {"interpreter": {"hash": "4d8dc73ac51fd938ce7dec941fbf542c26232b3529b0c2a6ebc607bfa3d5aa69"}}}, "nbformat": 4, "nbformat_minor": 2}
\ No newline at end of file
diff --git "a/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/4.\345\220\221\351\207\217\346\225\260\346\215\256\345\272\223\344\270\216\350\257\215\345\220\221\351\207\217 Vectorstores and Embeddings.ipynb" "b/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/4.\345\220\221\351\207\217\346\225\260\346\215\256\345\272\223\344\270\216\350\257\215\345\220\221\351\207\217 Vectorstores and Embeddings.ipynb"
index 50957a82..dcae806e 100644
--- "a/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/4.\345\220\221\351\207\217\346\225\260\346\215\256\345\272\223\344\270\216\350\257\215\345\220\221\351\207\217 Vectorstores and Embeddings.ipynb"
+++ "b/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/4.\345\220\221\351\207\217\346\225\260\346\215\256\345\272\223\344\270\216\350\257\215\345\220\221\351\207\217 Vectorstores and Embeddings.ipynb"
@@ -1 +1 @@
-{"cells":[{"cell_type":"markdown","id":"0d647f70","metadata":{},"source":["# 第四章 向量数据库与词向量(Vectorstores and Embeddings)\n","\n"," - [一、环境配置](#一、环境配置)\n"," - [二、读取文档](#二、读取文档)\n"," - [三、Embeddings](#三、Embeddings)\n"," - [四、Vectorstores](#四、Vectorstores)\n"," - [4.1 初始化Chroma](#4.1-初始化Chroma)\n"," - [4.2 相似性搜索(Similarity Search)](#4.2-相似性搜索(Similarity-Search))\n"," - [五、失败的情况(Failure modes)](#五、失败的情况(Failure-modes))\n"]},{"cell_type":"markdown","id":"ea9da9ec","metadata":{},"source":["回顾一下检索增强生成(RAG)的整体工作流程:"]},{"attachments":{"overview.jpeg":{"image/jpeg":""}},"cell_type":"markdown","id":"125247c2","metadata":{},"source":[""]},{"cell_type":"markdown","id":"88e5ba0b","metadata":{},"source":["## 一、环境配置"]},{"cell_type":"markdown","id":"f3d033b6","metadata":{},"source":["在当前文件夹下新建`.env`文件,内容为`OPENAI_API_KEY = \"sk-...\"`\n","\n","由于本章节需要使用`PyPDFLoader`、`Chroma`,故需要安装依赖包`pypdf`、`chromadb`"]},{"cell_type":"code","execution_count":1,"id":"7f8d2266-4a35-4904-ae9d-c89790c5ae61","metadata":{"height":166,"tags":[]},"outputs":[],"source":["import os\n","import openai\n","import sys\n","sys.path.append('../..')\n","\n","from dotenv import load_dotenv, find_dotenv\n","_ = load_dotenv(find_dotenv()) # 读取本地.env文件\n","\n","openai.api_key = os.environ['OPENAI_API_KEY']"]},{"cell_type":"markdown","id":"460a54b0","metadata":{},"source":["前两节课我们讨论了`Document Loading`(文档加载)和`Splitting`(分割)。\n","\n","下面我们将使用前两节课的知识对文档进行加载分割。"]},{"cell_type":"markdown","id":"18e077a0","metadata":{},"source":["## 二、读取文档"]},{"cell_type":"markdown","id":"0c1e123e","metadata":{},"source":["下面文档的课程链接 https://see.stanford.edu/Course/CS229 ,可在该网站上下载对应的课程讲义"]},{"cell_type":"code","execution_count":2,"id":"2437469e","metadata":{"height":249,"tags":[]},"outputs":[],"source":["from langchain.document_loaders import PyPDFLoader\n","\n","# 加载 PDF\n","loaders = [\n"," # 故意添加重复文档,使数据混乱\n"," PyPDFLoader(\"docs/cs229_lectures/MachineLearning-Lecture01.pdf\"),\n"," 
PyPDFLoader(\"docs/cs229_lectures/MachineLearning-Lecture01.pdf\"),\n"," PyPDFLoader(\"docs/cs229_lectures/MachineLearning-Lecture02.pdf\"),\n"," PyPDFLoader(\"docs/cs229_lectures/MachineLearning-Lecture03.pdf\")\n","]\n","docs = []\n","for loader in loaders:\n"," docs.extend(loader.load())"]},{"cell_type":"markdown","id":"d1ed78d9","metadata":{},"source":["下面文档是datawhale官方开源的matplotlib教程链接 https://datawhalechina.github.io/fantastic-matplotlib/index.html ,可在该网站上下载对应的教程"]},{"cell_type":"code","execution_count":3,"id":"11de2150","metadata":{},"outputs":[],"source":["from langchain.document_loaders import PyPDFLoader\n","\n","# 加载 PDF\n","loaders_chinese = [\n"," # 故意添加重复文档,使数据混乱\n"," PyPDFLoader(\"docs/matplotlib/第一回:Matplotlib初相识.pdf\"),\n"," PyPDFLoader(\"docs/matplotlib/第一回:Matplotlib初相识.pdf\"),\n"," PyPDFLoader(\"docs/matplotlib/第二回:艺术画笔见乾坤.pdf\"),\n"," PyPDFLoader(\"docs/matplotlib/第三回:布局格式定方圆.pdf\")\n","]\n","docs_chinese = []\n","for loader in loaders_chinese:\n"," docs_chinese.extend(loader.load())"]},{"cell_type":"markdown","id":"4b497f5c","metadata":{},"source":["在文档加载后,我们可以使用`RecursiveCharacterTextSplitter`(递归字符文本拆分器)来创建块。"]},{"cell_type":"code","execution_count":4,"id":"eb44bf0d","metadata":{"height":115,"tags":[]},"outputs":[],"source":["# 分割文本\n","from langchain.text_splitter import RecursiveCharacterTextSplitter\n","text_splitter = RecursiveCharacterTextSplitter(\n"," chunk_size = 1500, # 每个文本块的大小。这意味着每次切分文本时,会尽量使每个块包含 1500 个字符。\n"," chunk_overlap = 150 # 每个文本块之间的重叠部分。\n",")"]},{"cell_type":"code","execution_count":5,"id":"b71e46cc","metadata":{"height":30,"tags":[]},"outputs":[],"source":["splits = 
text_splitter.split_documents(docs)"]},{"cell_type":"code","execution_count":6,"id":"e061f22d","metadata":{"height":30,"tags":[]},"outputs":[{"data":{"text/plain":["209"]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["len(splits)"]},{"cell_type":"code","execution_count":7,"id":"99840d4c","metadata":{},"outputs":[],"source":["splits_chinese = text_splitter.split_documents(docs_chinese)"]},{"cell_type":"code","execution_count":8,"id":"d6b60ceb","metadata":{},"outputs":[{"data":{"text/plain":["27"]},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":["len(splits_chinese)"]},{"cell_type":"markdown","id":"848e26fd","metadata":{},"source":["## 三、Embeddings\n","\n","什么是`Embeddings`?\n","\n","在机器学习和自然语言处理(NLP)中,`Embeddings`(嵌入)是一种将类别数据,如单词、句子或者整个文档,转化为实数向量的技术。这些实数向量可以被计算机更好地理解和处理。嵌入背后的主要想法是,相似或相关的对象在嵌入空间中的距离应该很近。\n","\n","举个例子,我们可以使用词嵌入(word embeddings)来表示文本数据。在词嵌入中,每个单词被转换为一个向量,这个向量捕获了这个单词的语义信息。例如,\"king\" 和 \"queen\" 这两个单词在嵌入空间中的位置将会非常接近,因为它们的含义相似。而 \"apple\" 和 \"orange\" 也会很接近,因为它们都是水果。而 \"king\" 和 \"apple\" 这两个单词在嵌入空间中的距离就会比较远,因为它们的含义不同。\n","\n","让我们取出我们的切分部分并对它们进行`Embedding`处理。"]},{"cell_type":"code","execution_count":9,"id":"d9dca7a8","metadata":{"height":47,"tags":[]},"outputs":[{"name":"stderr","output_type":"stream","text":["/Users/lta/anaconda3/envs/chat_data/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:119: LangChainDeprecationWarning: The class `OpenAIEmbeddings` was deprecated in LangChain 0.0.9 and will be removed in 0.3.0. An updated version of the class exists in the langchain-openai package and should be used instead. 
To use it run `pip install -U langchain-openai` and import as `from langchain_openai import OpenAIEmbeddings`.\n"," warn_deprecated(\n"]}],"source":["from langchain.embeddings.openai import OpenAIEmbeddings\n","embedding = OpenAIEmbeddings(model='text-embedding-3-small')"]},{"cell_type":"markdown","id":"6ca11792","metadata":{},"source":["在使用真实文档数据的例子之前,让我们用几个测试案例的句子来试试,以便了解`embedding`。\n","\n","下面有几个示例句子,其中前两个非常相似,第三个与之无关。然后我们可以使用`embedding`类为每个句子创建一个`embedding`。"]},{"cell_type":"code","execution_count":10,"id":"abae34b5","metadata":{},"outputs":[],"source":["sentence1 = \"i like dogs\"\n","sentence2 = \"i like canines\"\n","sentence3 = \"the weather is ugly outside\""]},{"cell_type":"code","execution_count":11,"id":"d1745b19","metadata":{},"outputs":[],"source":["embedding1 = embedding.embed_query(sentence1)\n","embedding2 = embedding.embed_query(sentence2)\n","embedding3 = embedding.embed_query(sentence3)"]},{"cell_type":"code","execution_count":12,"id":"c4099521","metadata":{"height":64,"tags":[]},"outputs":[],"source":["sentence1_chinese = \"我喜欢狗\"\n","sentence2_chinese = \"我喜欢犬科动物\"\n","sentence3_chinese = \"外面的天气很糟糕\""]},{"cell_type":"code","execution_count":13,"id":"d553549a","metadata":{"height":64,"tags":[]},"outputs":[],"source":["embedding1_chinese = embedding.embed_query(sentence1_chinese)\n","embedding2_chinese = embedding.embed_query(sentence2_chinese)\n","embedding3_chinese = embedding.embed_query(sentence3_chinese)"]},{"cell_type":"markdown","id":"6b2805d4","metadata":{},"source":["然后我们可以使用`numpy`来比较它们,看看哪些最相似。\n","\n","我们期望前两个句子应该非常相似。\n","\n","然后,第一和第二个与第三个相比应该相差很大。\n","\n","我们将使用点积来比较两个嵌入。\n","\n","如果你不知道什么是点积,没关系。你只需要知道的重要一点是,分数越高句子越相似。"]},{"cell_type":"code","execution_count":14,"id":"0cbe9a9e","metadata":{"height":30,"tags":[]},"outputs":[],"source":["import numpy as 
np"]},{"cell_type":"code","execution_count":15,"id":"536db32e","metadata":{},"outputs":[{"data":{"text/plain":["0.8338763861124505"]},"execution_count":15,"metadata":{},"output_type":"execute_result"}],"source":["np.dot(embedding1, embedding2)"]},{"cell_type":"code","execution_count":16,"id":"29844c64","metadata":{},"outputs":[{"data":{"text/plain":["0.21898928790384764"]},"execution_count":16,"metadata":{},"output_type":"execute_result"}],"source":["np.dot(embedding1, embedding3)"]},{"cell_type":"code","execution_count":17,"id":"c1e17a9e","metadata":{},"outputs":[{"data":{"text/plain":["0.1850211777650424"]},"execution_count":17,"metadata":{},"output_type":"execute_result"}],"source":["np.dot(embedding2, embedding3)"]},{"cell_type":"markdown","id":"bdf08401","metadata":{},"source":["我们可以看到前两个`embedding`的分数相当高,为0.96。\n","\n","如果我们将第一个`embedding`与第三个`embedding`进行比较,我们可以看到它明显较低,约为0.77。\n","\n","如果我们将第二个`embedding`和第三个`embedding`进行比较,我们可以看到它的分数大约为0.75。"]},{"cell_type":"code","execution_count":18,"id":"49fc0f8f","metadata":{"height":30,"tags":[]},"outputs":[{"data":{"text/plain":["0.7232575326539132"]},"execution_count":18,"metadata":{},"output_type":"execute_result"}],"source":["np.dot(embedding1_chinese, embedding2_chinese)"]},{"cell_type":"code","execution_count":19,"id":"32a1fac7","metadata":{"height":30,"tags":[]},"outputs":[{"data":{"text/plain":["0.18710999861622954"]},"execution_count":19,"metadata":{},"output_type":"execute_result"}],"source":["np.dot(embedding1_chinese, embedding3_chinese)"]},{"cell_type":"code","execution_count":20,"id":"9dd18328","metadata":{"height":30,"tags":[]},"outputs":[{"data":{"text/plain":["0.13899606496112304"]},"execution_count":20,"metadata":{},"output_type":"execute_result"}],"source":["np.dot(embedding2_chinese, 
embedding3_chinese)"]},{"cell_type":"markdown","id":"9a17a527","metadata":{},"source":["我们可以看到前两个`embedding`的分数相当高,为0.94。\n","\n","如果我们将第一个`embedding`与第三个`embedding`进行比较,我们可以看到它明显较低,约为0.79。\n","\n","如果我们将第二个`embedding`和第三个`embedding`进行比较,我们可以看到它的分数大约为0.78。"]},{"cell_type":"markdown","id":"4fc7b24f","metadata":{},"source":["## 四、Vectorstores"]},{"cell_type":"markdown","id":"54776973","metadata":{},"source":["### 4.1 初始化Chroma"]},{"cell_type":"markdown","id":"20916c7c","metadata":{},"source":["Langchain集成了超过30个不同的向量存储库。我们选择Chroma是因为它轻量级且数据存储在内存中,这使得它非常容易启动和开始使用。"]},{"cell_type":"code","execution_count":21,"id":"201e6afa","metadata":{"height":30,"tags":[]},"outputs":[],"source":["from langchain.vectorstores import Chroma"]},{"cell_type":"code","execution_count":22,"id":"93960ac5","metadata":{"height":30,"tags":[]},"outputs":[],"source":["persist_directory = 'docs/chroma/cs229_lectures/'"]},{"cell_type":"code","execution_count":23,"id":"a195e72a","metadata":{"height":30,"tags":[]},"outputs":[],"source":["!rm -rf './docs/chroma/cs229_lectures' # 删除旧的数据库文件(如果文件夹中有文件的话),window电脑请手动删除"]},{"cell_type":"code","execution_count":24,"id":"690efd0a","metadata":{"height":98,"tags":[]},"outputs":[],"source":["vectordb = Chroma.from_documents(\n"," documents=splits,\n"," embedding=embedding,\n"," persist_directory=persist_directory # 允许我们将persist_directory目录保存到磁盘上\n",")"]},{"cell_type":"code","execution_count":25,"id":"f777480c","metadata":{"height":30,"tags":[]},"outputs":[{"name":"stdout","output_type":"stream","text":["209\n"]}],"source":["print(vectordb._collection.count())"]},{"cell_type":"code","execution_count":26,"id":"d720e2b7","metadata":{},"outputs":[],"source":["persist_directory_chinese = 'docs/chroma/matplotlib/'"]},{"cell_type":"code","execution_count":27,"id":"570a2768","metadata":{},"outputs":[],"source":["!rm -rf './docs/chroma/matplotlib' # 
删除旧的数据库文件(如果文件夹中有文件的话)"]},{"cell_type":"code","execution_count":28,"id":"1e03438d","metadata":{},"outputs":[],"source":["vectordb_chinese = Chroma.from_documents(\n"," documents=splits_chinese,\n"," embedding=embedding,\n"," persist_directory=persist_directory_chinese # 允许我们将persist_directory目录保存到磁盘上\n",")"]},{"cell_type":"code","execution_count":29,"id":"cf87aba0","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["27\n"]}],"source":["print(vectordb_chinese._collection.count())"]},{"cell_type":"markdown","id":"9f78f412","metadata":{},"source":["我们可以看到英文版的长度也是209、中文版的长度也是30,这与我们之前的切分数量是一样的。现在让我们开始使用它。"]},{"cell_type":"markdown","id":"efca7589","metadata":{},"source":["### 4.2 相似性搜索(Similarity Search)"]},{"cell_type":"code","execution_count":30,"id":"3e20837d","metadata":{"height":30,"tags":[]},"outputs":[],"source":["question = \"is there an email i can ask for help\" # \"有我可以寻求帮助的电子邮件吗\""]},{"cell_type":"code","execution_count":31,"id":"f9bde572","metadata":{"height":30,"tags":[]},"outputs":[],"source":["docs = vectordb.similarity_search(question,k=3)"]},{"cell_type":"code","execution_count":32,"id":"41388af1","metadata":{"height":30,"tags":[]},"outputs":[{"data":{"text/plain":["3"]},"execution_count":32,"metadata":{},"output_type":"execute_result"}],"source":["len(docs)"]},{"cell_type":"code","execution_count":33,"id":"183434f6","metadata":{"height":30,"tags":[]},"outputs":[{"data":{"text/plain":["\"So all right, online resources. The class has a home page, so it's in on the handouts. I \\nwon't write on the chalkboard — http:// cs229.stanford.edu. And so when there are \\nhomework assignments or things like that, we usually won't sort of — in the mission of \\nsaving trees, we will usually not give out many handouts in class. So homework \\nassignments, homework solutions will be posted online at the course home page. 
\\nAs far as this class, I've also written, a nd I guess I've also revised every year a set of \\nfairly detailed lecture notes that cover the te chnical content of this class. And so if you \\nvisit the course homepage, you'll also find the detailed lecture notes that go over in detail \\nall the math and equations and so on that I'll be doing in class. \\nThere's also a newsgroup, su.class.cs229, also written on the handout. This is a \\nnewsgroup that's sort of a forum for people in the class to get to know each other and \\nhave whatever discussions you want to ha ve amongst yourselves. So the class newsgroup \\nwill not be monitored by the TAs and me. But this is a place for you to form study groups \\nor find project partners or discuss homework problems and so on, and it's not monitored \\nby the TAs and me. So feel free to ta lk trash about this class there. \\nIf you want to contact the teaching staff, pl ease use the email address written down here, \\ncs229-qa@cs.stanford.edu. This goes to an acc ount that's read by all the TAs and me. 
So\""]},"execution_count":33,"metadata":{},"output_type":"execute_result"}],"source":["docs[0].page_content"]},{"cell_type":"markdown","id":"1edb21d8","metadata":{},"source":["如果我们查看第一个文档的内容,我们可以看到它实际上是关于一个电子邮件地址,cs229-qa@cs.stanford.edu。\n","\n","这是我们可以向其发送问题的电子邮件,所有的助教都会阅读这些邮件。"]},{"cell_type":"code","execution_count":34,"id":"53bcc061","metadata":{},"outputs":[],"source":["question_chinese = \"Matplotlib是什么?\" "]},{"cell_type":"code","execution_count":35,"id":"2d2dc834","metadata":{},"outputs":[],"source":["docs_chinese = vectordb_chinese.similarity_search(question_chinese,k=3)"]},{"cell_type":"code","execution_count":36,"id":"eebe77e7","metadata":{},"outputs":[{"data":{"text/plain":["3"]},"execution_count":36,"metadata":{},"output_type":"execute_result"}],"source":["len(docs_chinese)"]},{"cell_type":"code","execution_count":37,"id":"860af154","metadata":{},"outputs":[{"data":{"text/plain":["'第⼀回:Matplotlib 初相识\\n⼀、认识matplotlib\\nMatplotlib 是⼀个 Python 2D 绘图库,能够以多种硬拷⻉格式和跨平台的交互式环境⽣成出版物质量的图形,⽤来绘制各种静态,动态,\\n交互式的图表。\\nMatplotlib 可⽤于 Python 脚本, Python 和 IPython Shell 、 Jupyter notebook , Web 应⽤程序服务器和各种图形⽤户界⾯⼯具包等。\\nMatplotlib 是 Python 数据可视化库中的泰⽃,它已经成为 python 中公认的数据可视化⼯具,我们所熟知的 pandas 和 seaborn 的绘图接⼝\\n其实也是基于 matplotlib 所作的⾼级封装。\\n为了对matplotlib 有更好的理解,让我们从⼀些最基本的概念开始认识它,再逐渐过渡到⼀些⾼级技巧中。\\n⼆、⼀个最简单的绘图例⼦\\nMatplotlib 的图像是画在 figure (如 windows , jupyter 窗体)上的,每⼀个 figure ⼜包含了⼀个或多个 axes (⼀个可以指定坐标系的⼦区\\n域)。最简单的创建 figure 以及 axes 的⽅式是通过 pyplot.subplots命令,创建 axes 以后,可以使⽤ Axes.plot绘制最简易的折线图。\\nimport matplotlib.pyplot as plt\\nimport matplotlib as mpl\\nimport numpy as np\\nfig, ax = plt.subplots() # 创建⼀个包含⼀个 axes 的 figure\\nax.plot([1, 2, 3, 4], [1, 4, 2, 3]); # 绘制图像\\nTrick: 在jupyter notebook 中使⽤ matplotlib 时会发现,代码运⾏后⾃动打印出类似 \\n这样⼀段话,这是因为 matplotlib 的绘图代码默认打印出最后⼀个对象。如果不想显示这句话,有以下三种⽅法,在本章节的代码示例\\n中你能找到这三种⽅法的使⽤。\\n\\x00. 在代码块最后加⼀个分号 ;\\n\\x00. 在代码块最后加⼀句 plt.show()\\n\\x00. 
在绘图时将绘图对象显式赋值给⼀个变量,如将 plt.plot([1, 2, 3, 4]) 改成 line =plt.plot([1, 2, 3, 4])\\n和MATLAB 命令类似,你还可以通过⼀种更简单的⽅式绘制图像, matplotlib.pyplot⽅法能够直接在当前 axes 上绘制图像,如果⽤户\\n未指定axes , matplotlib 会帮你⾃动创建⼀个。所以上⾯的例⼦也可以简化为以下这⼀⾏代码。\\nline =plt.plot([1, 2, 3, 4], [1, 4, 2, 3]) \\n三、Figure 的组成\\n现在我们来深⼊看⼀下 figure 的组成。通过⼀张 figure 解剖图,我们可以看到⼀个完整的 matplotlib 图像通常会包括以下四个层级,这些\\n层级也被称为容器( container ),下⼀节会详细介绍。在 matplotlib 的世界中,我们将通过各种命令⽅法来操纵图像中的每⼀个部分,\\n从⽽达到数据可视化的最终效果,⼀副完整的图像实际上是各类⼦元素的集合。\\nFigure:顶层级,⽤来容纳所有绘图元素'"]},"execution_count":37,"metadata":{},"output_type":"execute_result"}],"source":["docs_chinese[0].page_content"]},{"cell_type":"markdown","id":"60a720a6","metadata":{},"source":["如果我们查看第一个文档的内容,我们可以看到它实际上是关于Matplotlib的介绍"]},{"cell_type":"markdown","id":"fba57a1d","metadata":{},"source":["在此之后,我们要确保通过运行vectordb.persist来持久化向量数据库,以便我们在未来的课程中使用。\n","\n","让我们保存它,以便以后使用!"]},{"cell_type":"code","execution_count":38,"id":"ea657123","metadata":{"height":30,"tags":[]},"outputs":[{"name":"stderr","output_type":"stream","text":["/Users/lta/anaconda3/envs/chat_data/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:119: LangChainDeprecationWarning: Since Chroma 0.4.x the manual persistence method is no longer supported as docs are automatically persisted.\n"," warn_deprecated(\n"]}],"source":["vectordb.persist()"]},{"cell_type":"code","execution_count":39,"id":"9065f5b1","metadata":{},"outputs":[],"source":["vectordb_chinese.persist()"]},{"cell_type":"markdown","id":"cefe9f6a","metadata":{},"source":["## 五、失败的情况(Failure modes)"]},{"cell_type":"markdown","id":"089da0ca","metadata":{},"source":["这看起来很好,基本的相似性搜索很容易就能让你完成80%的工作。\n","\n","但是,可能会出现一些相似性搜索失败的情况。\n","\n","这里有一些可能出现的边缘情况——我们将在下一堂课中修复它们。"]},{"cell_type":"code","execution_count":40,"id":"df0f29f9","metadata":{"height":30,"tags":[]},"outputs":[],"source":["question = \"what did they say about matlab?\" # \"他们对 matlab 
有何评价?\""]},{"cell_type":"code","execution_count":41,"id":"02be97df","metadata":{"height":30,"tags":[]},"outputs":[],"source":["docs = vectordb.similarity_search(question,k=5)"]},{"cell_type":"code","execution_count":42,"id":"34359d7a","metadata":{},"outputs":[],"source":["question_chinese = \"Matplotlib是什么?\""]},{"cell_type":"code","execution_count":43,"id":"c0546006","metadata":{},"outputs":[],"source":["docs_chinese = vectordb_chinese.similarity_search(question_chinese,k=5)"]},{"cell_type":"markdown","id":"2a9f579e","metadata":{},"source":["请注意,我们得到了重复的块(因为索引中有重复的 `MachineLearning-Lecture01.pdf`、`第一回:Matplotlib初相识.pdf`)。\n","\n","语义搜索获取所有相似的文档,但不强制多样性。\n","\n","`docs[0]` 和 `docs[1]` 是完全相同的,以及`docs_chinese[0]` 和 `docs_chinese[1]` 是完全相同的。"]},{"cell_type":"code","execution_count":44,"id":"d39f6954","metadata":{"height":30,"tags":[]},"outputs":[{"data":{"text/plain":["Document(page_content='those homeworks will be done in either MATLA B or in Octave, which is sort of — I \\nknow some people call it a free ve rsion of MATLAB, which it sort of is, sort of isn\\'t. \\nSo I guess for those of you that haven\\'t s een MATLAB before, and I know most of you \\nhave, MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to \\nplot data. And it\\'s sort of an extremely easy to learn tool to use for implementing a lot of \\nlearning algorithms. \\nAnd in case some of you want to work on your own home computer or something if you \\ndon\\'t have a MATLAB license, for the purposes of this class, there\\'s also — [inaudible] \\nwrite that down [inaudible] MATLAB — there\\' s also a software package called Octave \\nthat you can download for free off the Internet. And it has somewhat fewer features than MATLAB, but it\\'s free, and for the purposes of this class, it will work for just about \\neverything. 
\\nSo actually I, well, so yeah, just a side comment for those of you that haven\\'t seen \\nMATLAB before I guess, once a colleague of mine at a different university, not at \\nStanford, actually teaches another machine l earning course. He\\'s taught it for many years. \\nSo one day, he was in his office, and an old student of his from, lik e, ten years ago came \\ninto his office and he said, \"Oh, professo r, professor, thank you so much for your', metadata={'page': 8, 'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf'})"]},"execution_count":44,"metadata":{},"output_type":"execute_result"}],"source":["docs[0]"]},{"cell_type":"code","execution_count":45,"id":"e04e3d1b","metadata":{"height":30,"tags":[]},"outputs":[{"data":{"text/plain":["Document(page_content='those homeworks will be done in either MATLA B or in Octave, which is sort of — I \\nknow some people call it a free ve rsion of MATLAB, which it sort of is, sort of isn\\'t. \\nSo I guess for those of you that haven\\'t s een MATLAB before, and I know most of you \\nhave, MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to \\nplot data. And it\\'s sort of an extremely easy to learn tool to use for implementing a lot of \\nlearning algorithms. \\nAnd in case some of you want to work on your own home computer or something if you \\ndon\\'t have a MATLAB license, for the purposes of this class, there\\'s also — [inaudible] \\nwrite that down [inaudible] MATLAB — there\\' s also a software package called Octave \\nthat you can download for free off the Internet. And it has somewhat fewer features than MATLAB, but it\\'s free, and for the purposes of this class, it will work for just about \\neverything. 
\\nSo actually I, well, so yeah, just a side comment for those of you that haven\\'t seen \\nMATLAB before I guess, once a colleague of mine at a different university, not at \\nStanford, actually teaches another machine l earning course. He\\'s taught it for many years. \\nSo one day, he was in his office, and an old student of his from, lik e, ten years ago came \\ninto his office and he said, \"Oh, professo r, professor, thank you so much for your', metadata={'page': 8, 'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf'})"]},"execution_count":45,"metadata":{},"output_type":"execute_result"}],"source":["docs[1]"]},{"cell_type":"code","execution_count":46,"id":"092fd2f5","metadata":{},"outputs":[{"data":{"text/plain":["Document(page_content='第⼀回:Matplotlib 初相识\\n⼀、认识matplotlib\\nMatplotlib 是⼀个 Python 2D 绘图库,能够以多种硬拷⻉格式和跨平台的交互式环境⽣成出版物质量的图形,⽤来绘制各种静态,动态,\\n交互式的图表。\\nMatplotlib 可⽤于 Python 脚本, Python 和 IPython Shell 、 Jupyter notebook , Web 应⽤程序服务器和各种图形⽤户界⾯⼯具包等。\\nMatplotlib 是 Python 数据可视化库中的泰⽃,它已经成为 python 中公认的数据可视化⼯具,我们所熟知的 pandas 和 seaborn 的绘图接⼝\\n其实也是基于 matplotlib 所作的⾼级封装。\\n为了对matplotlib 有更好的理解,让我们从⼀些最基本的概念开始认识它,再逐渐过渡到⼀些⾼级技巧中。\\n⼆、⼀个最简单的绘图例⼦\\nMatplotlib 的图像是画在 figure (如 windows , jupyter 窗体)上的,每⼀个 figure ⼜包含了⼀个或多个 axes (⼀个可以指定坐标系的⼦区\\n域)。最简单的创建 figure 以及 axes 的⽅式是通过 pyplot.subplots命令,创建 axes 以后,可以使⽤ Axes.plot绘制最简易的折线图。\\nimport matplotlib.pyplot as plt\\nimport matplotlib as mpl\\nimport numpy as np\\nfig, ax = plt.subplots() # 创建⼀个包含⼀个 axes 的 figure\\nax.plot([1, 2, 3, 4], [1, 4, 2, 3]); # 绘制图像\\nTrick: 在jupyter notebook 中使⽤ matplotlib 时会发现,代码运⾏后⾃动打印出类似 \\n这样⼀段话,这是因为 matplotlib 的绘图代码默认打印出最后⼀个对象。如果不想显示这句话,有以下三种⽅法,在本章节的代码示例\\n中你能找到这三种⽅法的使⽤。\\n\\x00. 在代码块最后加⼀个分号 ;\\n\\x00. 在代码块最后加⼀句 plt.show()\\n\\x00. 
在绘图时将绘图对象显式赋值给⼀个变量,如将 plt.plot([1, 2, 3, 4]) 改成 line =plt.plot([1, 2, 3, 4])\\n和MATLAB 命令类似,你还可以通过⼀种更简单的⽅式绘制图像, matplotlib.pyplot⽅法能够直接在当前 axes 上绘制图像,如果⽤户\\n未指定axes , matplotlib 会帮你⾃动创建⼀个。所以上⾯的例⼦也可以简化为以下这⼀⾏代码。\\nline =plt.plot([1, 2, 3, 4], [1, 4, 2, 3]) \\n三、Figure 的组成\\n现在我们来深⼊看⼀下 figure 的组成。通过⼀张 figure 解剖图,我们可以看到⼀个完整的 matplotlib 图像通常会包括以下四个层级,这些\\n层级也被称为容器( container ),下⼀节会详细介绍。在 matplotlib 的世界中,我们将通过各种命令⽅法来操纵图像中的每⼀个部分,\\n从⽽达到数据可视化的最终效果,⼀副完整的图像实际上是各类⼦元素的集合。\\nFigure:顶层级,⽤来容纳所有绘图元素', metadata={'page': 0, 'source': 'docs/matplotlib/第一回:Matplotlib初相识.pdf'})"]},"execution_count":46,"metadata":{},"output_type":"execute_result"}],"source":["docs_chinese[0]"]},{"cell_type":"code","execution_count":47,"id":"270f0a10","metadata":{},"outputs":[{"data":{"text/plain":["Document(page_content='第⼀回:Matplotlib 初相识\\n⼀、认识matplotlib\\nMatplotlib 是⼀个 Python 2D 绘图库,能够以多种硬拷⻉格式和跨平台的交互式环境⽣成出版物质量的图形,⽤来绘制各种静态,动态,\\n交互式的图表。\\nMatplotlib 可⽤于 Python 脚本, Python 和 IPython Shell 、 Jupyter notebook , Web 应⽤程序服务器和各种图形⽤户界⾯⼯具包等。\\nMatplotlib 是 Python 数据可视化库中的泰⽃,它已经成为 python 中公认的数据可视化⼯具,我们所熟知的 pandas 和 seaborn 的绘图接⼝\\n其实也是基于 matplotlib 所作的⾼级封装。\\n为了对matplotlib 有更好的理解,让我们从⼀些最基本的概念开始认识它,再逐渐过渡到⼀些⾼级技巧中。\\n⼆、⼀个最简单的绘图例⼦\\nMatplotlib 的图像是画在 figure (如 windows , jupyter 窗体)上的,每⼀个 figure ⼜包含了⼀个或多个 axes (⼀个可以指定坐标系的⼦区\\n域)。最简单的创建 figure 以及 axes 的⽅式是通过 pyplot.subplots命令,创建 axes 以后,可以使⽤ Axes.plot绘制最简易的折线图。\\nimport matplotlib.pyplot as plt\\nimport matplotlib as mpl\\nimport numpy as np\\nfig, ax = plt.subplots() # 创建⼀个包含⼀个 axes 的 figure\\nax.plot([1, 2, 3, 4], [1, 4, 2, 3]); # 绘制图像\\nTrick: 在jupyter notebook 中使⽤ matplotlib 时会发现,代码运⾏后⾃动打印出类似 \\n这样⼀段话,这是因为 matplotlib 的绘图代码默认打印出最后⼀个对象。如果不想显示这句话,有以下三种⽅法,在本章节的代码示例\\n中你能找到这三种⽅法的使⽤。\\n\\x00. 在代码块最后加⼀个分号 ;\\n\\x00. 在代码块最后加⼀句 plt.show()\\n\\x00. 
在绘图时将绘图对象显式赋值给⼀个变量,如将 plt.plot([1, 2, 3, 4]) 改成 line =plt.plot([1, 2, 3, 4])\\n和MATLAB 命令类似,你还可以通过⼀种更简单的⽅式绘制图像, matplotlib.pyplot⽅法能够直接在当前 axes 上绘制图像,如果⽤户\\n未指定axes , matplotlib 会帮你⾃动创建⼀个。所以上⾯的例⼦也可以简化为以下这⼀⾏代码。\\nline =plt.plot([1, 2, 3, 4], [1, 4, 2, 3]) \\n三、Figure 的组成\\n现在我们来深⼊看⼀下 figure 的组成。通过⼀张 figure 解剖图,我们可以看到⼀个完整的 matplotlib 图像通常会包括以下四个层级,这些\\n层级也被称为容器( container ),下⼀节会详细介绍。在 matplotlib 的世界中,我们将通过各种命令⽅法来操纵图像中的每⼀个部分,\\n从⽽达到数据可视化的最终效果,⼀副完整的图像实际上是各类⼦元素的集合。\\nFigure:顶层级,⽤来容纳所有绘图元素', metadata={'page': 0, 'source': 'docs/matplotlib/第一回:Matplotlib初相识.pdf'})"]},"execution_count":47,"metadata":{},"output_type":"execute_result"}],"source":["docs_chinese[1]"]},{"cell_type":"markdown","id":"3a3a915d","metadata":{},"source":["我们可以看到一种新的失败的情况。\n","\n","下面的问题询问了关于第三讲的问题,但也包括了来自其他讲的结果。"]},{"cell_type":"code","execution_count":48,"id":"b19135e5","metadata":{"height":45,"tags":[]},"outputs":[],"source":["question = \"what did they say about regression in the third lecture?\" # \"他们在第三讲中是怎么谈论回归的?\""]},{"cell_type":"code","execution_count":49,"id":"d434942c","metadata":{"height":30,"tags":[]},"outputs":[],"source":["docs = vectordb.similarity_search(question,k=5)"]},{"cell_type":"code","execution_count":50,"id":"c2c5df59","metadata":{"height":47,"tags":[]},"outputs":[{"name":"stdout","output_type":"stream","text":["{'page': 0, 'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf'}\n","{'page': 14, 'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf'}\n","{'page': 6, 'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf'}\n","{'page': 0, 'source': 'docs/cs229_lectures/MachineLearning-Lecture02.pdf'}\n","{'page': 2, 'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf'}\n"]}],"source":["for doc in docs:\n"," print(doc.metadata)"]},{"cell_type":"code","execution_count":51,"id":"8e6cb50c","metadata":{"height":30,"tags":[]},"outputs":[{"name":"stdout","output_type":"stream","text":["regression problem like this. 
What I want to do today is talk about a class of algorithms \n","called non-parametric learning algorithms that will help to alleviate the need somewhat \n","for you to choose features very carefully. Okay ? And this leads us in to our discussion of \n","locally weighted regression. And just to de fine the term, linear regression, as we’ve \n","defined it so far, is an example of a parame tric learning algorithm. Parametric learning \n","algorithm is one that’s defined as an algorithm that has a fixed number of parameters that \n","fit to the data. Okay? So in linear regression we have a fix set of parameters theta, right? \n","That must fit to the data. In contrast, what I’m gonna talk about now is our first non-\n","parametric learning algorithm. The formal defi nition, which is not very intuitive, so I’ve \n","replaced it with a second, say, more intuitive. The, sort of, formal definition of the non-\n","parametric learning algorithm is that it’s an algorithm where the number of parameters \n","goes with M, with the size of the training se t. And usually it’s de fined as a number of \n","parameters grows linearly with the size of the training set. Th is is the formal definition. 
A \n","slightly less formal definition is that th e amount of stuff that your learning algorithm \n","needs to keep around will grow linearly with th e training sets or, in another way of saying \n","it, is that this is an algorithm that we’ll n eed to keep around an entire training set, even\n"]}],"source":["print(docs[4].page_content)"]},{"cell_type":"code","execution_count":52,"id":"e0a7e9c8","metadata":{},"outputs":[],"source":["question_chinese = \"他们在第二讲中对Figure说了些什么?\" "]},{"cell_type":"code","execution_count":53,"id":"62c0bfc6","metadata":{},"outputs":[],"source":["docs_chinese = vectordb_chinese.similarity_search(question_chinese,k=5)"]},{"cell_type":"code","execution_count":54,"id":"26bd586d","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["{'page': 9, 'source': 'docs/matplotlib/第二回:艺术画笔见乾坤.pdf'}\n","{'page': 0, 'source': 'docs/matplotlib/第一回:Matplotlib初相识.pdf'}\n","{'page': 0, 'source': 'docs/matplotlib/第一回:Matplotlib初相识.pdf'}\n","{'page': 0, 'source': 'docs/matplotlib/第三回:布局格式定方圆.pdf'}\n","{'page': 0, 'source': 'docs/matplotlib/第二回:艺术画笔见乾坤.pdf'}\n"]}],"source":["for doc_chinese in docs_chinese:\n"," print(doc_chinese.metadata)"]},{"cell_type":"code","execution_count":55,"id":"71b6bbf3","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["第⼀回:Matplotlib 初相识\n","⼀、认识matplotlib\n","Matplotlib 是⼀个 Python 2D 绘图库,能够以多种硬拷⻉格式和跨平台的交互式环境⽣成出版物质量的图形,⽤来绘制各种静态,动态,\n","交互式的图表。\n","Matplotlib 可⽤于 Python 脚本, Python 和 IPython Shell 、 Jupyter notebook , Web 应⽤程序服务器和各种图形⽤户界⾯⼯具包等。\n","Matplotlib 是 Python 数据可视化库中的泰⽃,它已经成为 python 中公认的数据可视化⼯具,我们所熟知的 pandas 和 seaborn 的绘图接⼝\n","其实也是基于 matplotlib 所作的⾼级封装。\n","为了对matplotlib 有更好的理解,让我们从⼀些最基本的概念开始认识它,再逐渐过渡到⼀些⾼级技巧中。\n","⼆、⼀个最简单的绘图例⼦\n","Matplotlib 的图像是画在 figure (如 windows , jupyter 窗体)上的,每⼀个 figure ⼜包含了⼀个或多个 axes (⼀个可以指定坐标系的⼦区\n","域)。最简单的创建 figure 以及 axes 的⽅式是通过 pyplot.subplots命令,创建 axes 以后,可以使⽤ Axes.plot绘制最简易的折线图。\n","import matplotlib.pyplot as plt\n","import matplotlib as 
mpl\n","import numpy as np\n","fig, ax = plt.subplots() # 创建⼀个包含⼀个 axes 的 figure\n","ax.plot([1, 2, 3, 4], [1, 4, 2, 3]); # 绘制图像\n","Trick: 在jupyter notebook 中使⽤ matplotlib 时会发现,代码运⾏后⾃动打印出类似 \n","这样⼀段话,这是因为 matplotlib 的绘图代码默认打印出最后⼀个对象。如果不想显示这句话,有以下三种⽅法,在本章节的代码示例\n","中你能找到这三种⽅法的使⽤。\n","\u0000. 在代码块最后加⼀个分号 ;\n","\u0000. 在代码块最后加⼀句 plt.show()\n","\u0000. 在绘图时将绘图对象显式赋值给⼀个变量,如将 plt.plot([1, 2, 3, 4]) 改成 line =plt.plot([1, 2, 3, 4])\n","和MATLAB 命令类似,你还可以通过⼀种更简单的⽅式绘制图像, matplotlib.pyplot⽅法能够直接在当前 axes 上绘制图像,如果⽤户\n","未指定axes , matplotlib 会帮你⾃动创建⼀个。所以上⾯的例⼦也可以简化为以下这⼀⾏代码。\n","line =plt.plot([1, 2, 3, 4], [1, 4, 2, 3]) \n","三、Figure 的组成\n","现在我们来深⼊看⼀下 figure 的组成。通过⼀张 figure 解剖图,我们可以看到⼀个完整的 matplotlib 图像通常会包括以下四个层级,这些\n","层级也被称为容器( container ),下⼀节会详细介绍。在 matplotlib 的世界中,我们将通过各种命令⽅法来操纵图像中的每⼀个部分,\n","从⽽达到数据可视化的最终效果,⼀副完整的图像实际上是各类⼦元素的集合。\n","Figure:顶层级,⽤来容纳所有绘图元素\n"]}],"source":["print(docs_chinese[2].page_content)"]},{"cell_type":"markdown","id":"c3dbca56","metadata":{},"source":["在下一讲中讨论的方法可以用来解决这两个问题!"]}],"metadata":{"kernelspec":{"display_name":"auto-gpt","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.1.-1"}},"nbformat":4,"nbformat_minor":5}
+{"cells": [{"cell_type": "markdown", "id": "0d647f70", "metadata": {}, "source": ["# \u7b2c\u56db\u7ae0 \u5411\u91cf\u6570\u636e\u5e93\u4e0e\u8bcd\u5411\u91cf(Vectorstores and Embeddings)\n", "\n", " - [\u4e00\u3001\u73af\u5883\u914d\u7f6e](#\u4e00\u3001\u73af\u5883\u914d\u7f6e)\n", " - [\u4e8c\u3001\u8bfb\u53d6\u6587\u6863](#\u4e8c\u3001\u8bfb\u53d6\u6587\u6863)\n", " - [\u4e09\u3001Embeddings](#\u4e09\u3001Embeddings)\n", " - [\u56db\u3001Vectorstores](#\u56db\u3001Vectorstores)\n", " - [4.1 \u521d\u59cb\u5316Chroma](#4.1-\u521d\u59cb\u5316Chroma)\n", " - [4.2 \u76f8\u4f3c\u6027\u641c\u7d22(Similarity Search)](#4.2-\u76f8\u4f3c\u6027\u641c\u7d22(Similarity-Search))\n", " - [\u4e94\u3001\u5931\u8d25\u7684\u60c5\u51b5(Failure modes)](#\u4e94\u3001\u5931\u8d25\u7684\u60c5\u51b5(Failure-modes))\n"]}, {"cell_type": "markdown", "id": "ea9da9ec", "metadata": {}, "source": ["\u56de\u987e\u4e00\u4e0b\u68c0\u7d22\u589e\u5f3a\u751f\u6210\uff08RAG\uff09\u7684\u6574\u4f53\u5de5\u4f5c\u6d41\u7a0b\uff1a"]}, {"attachments": {"overview.jpeg": {"image/jpeg": ""}}, "cell_type": "markdown", "id": "125247c2", "metadata": {}, "source": [""]}, {"cell_type": "markdown", "id": "88e5ba0b", "metadata": {}, "source": ["## \u4e00\u3001\u73af\u5883\u914d\u7f6e"]}, {"cell_type": "markdown", "id": "f3d033b6", "metadata": {}, "source": ["\u5728\u5f53\u524d\u6587\u4ef6\u5939\u4e0b\u65b0\u5efa`.env`\u6587\u4ef6\uff0c\u5185\u5bb9\u4e3a`OPENAI_API_KEY = \"sk-...\"`\n", "\n", "\u7531\u4e8e\u672c\u7ae0\u8282\u9700\u8981\u4f7f\u7528`PyPDFLoader`\u3001`Chroma`\uff0c\u6545\u9700\u8981\u5b89\u88c5\u4f9d\u8d56\u5305`pypdf`\u3001`chromadb`"]}, {"cell_type": "code", "execution_count": 3, "id": "8ed9c2eb", "metadata": {}, "outputs": [], "source": ["!pip install -Uq pypdf\n", "!pip install -Uq chromadb"]}, {"cell_type": "code", "execution_count": 5, "id": "7f8d2266-4a35-4904-ae9d-c89790c5ae61", "metadata": {"height": 166, "tags": []}, "outputs": [], "source": ["import os\n", "import openai\n", "import sys\n", 
"sys.path.append('../..')\n", "\n", "from dotenv import load_dotenv, find_dotenv\n", "_ = load_dotenv(find_dotenv()) # \u8bfb\u53d6\u672c\u5730.env\u6587\u4ef6\n", "\n", "openai.api_key = os.environ['OPENAI_API_KEY']"]}, {"cell_type": "markdown", "id": "460a54b0", "metadata": {}, "source": ["\u524d\u4e24\u8282\u8bfe\u6211\u4eec\u8ba8\u8bba\u4e86`Document Loading`\uff08\u6587\u6863\u52a0\u8f7d\uff09\u548c`Splitting`\uff08\u5206\u5272\uff09\u3002\n", "\n", "\u4e0b\u9762\u6211\u4eec\u5c06\u4f7f\u7528\u524d\u4e24\u8282\u8bfe\u7684\u77e5\u8bc6\u5bf9\u6587\u6863\u8fdb\u884c\u52a0\u8f7d\u5206\u5272\u3002"]}, {"cell_type": "markdown", "id": "18e077a0", "metadata": {}, "source": ["## \u4e8c\u3001\u8bfb\u53d6\u6587\u6863"]}, {"cell_type": "markdown", "id": "0c1e123e", "metadata": {}, "source": ["\u4e0b\u9762\u6587\u6863\u7684\u8bfe\u7a0b\u94fe\u63a5 https://see.stanford.edu/Course/CS229 \uff0c\u53ef\u5728\u8be5\u7f51\u7ad9\u4e0a\u4e0b\u8f7d\u5bf9\u5e94\u7684\u8bfe\u7a0b\u8bb2\u4e49"]}, {"cell_type": "code", "execution_count": 6, "id": "2437469e", "metadata": {"height": 249, "tags": []}, "outputs": [], "source": ["from langchain.document_loaders import PyPDFLoader\n", "\n", "# \u52a0\u8f7d PDF\n", "loaders = [\n", " # \u6545\u610f\u6dfb\u52a0\u91cd\u590d\u6587\u6863\uff0c\u4f7f\u6570\u636e\u6df7\u4e71\n", " PyPDFLoader(\"docs/cs229_lectures/MachineLearning-Lecture01.pdf\"),\n", " PyPDFLoader(\"docs/cs229_lectures/MachineLearning-Lecture01.pdf\"),\n", " PyPDFLoader(\"docs/cs229_lectures/MachineLearning-Lecture02.pdf\"),\n", " PyPDFLoader(\"docs/cs229_lectures/MachineLearning-Lecture03.pdf\")\n", "]\n", "docs = []\n", "for loader in loaders:\n", " docs.extend(loader.load())"]}, {"cell_type": "markdown", "id": "d1ed78d9", "metadata": {}, "source": ["\u4e0b\u9762\u6587\u6863\u662fdatawhale\u5b98\u65b9\u5f00\u6e90\u7684matplotlib\u6559\u7a0b\u94fe\u63a5 https://datawhalechina.github.io/fantastic-matplotlib/index.html 
\uff0c\u53ef\u5728\u8be5\u7f51\u7ad9\u4e0a\u4e0b\u8f7d\u5bf9\u5e94\u7684\u6559\u7a0b"]}, {"cell_type": "code", "execution_count": 60, "id": "11de2150", "metadata": {}, "outputs": [], "source": ["from langchain.document_loaders import PyPDFLoader\n", "\n", "# \u52a0\u8f7d PDF\n", "loaders_chinese = [\n", " # \u6545\u610f\u6dfb\u52a0\u91cd\u590d\u6587\u6863\uff0c\u4f7f\u6570\u636e\u6df7\u4e71\n", " PyPDFLoader(\"docs/matplotlib/\u7b2c\u4e00\u56de\uff1aMatplotlib\u521d\u76f8\u8bc6.pdf\"),\n", " PyPDFLoader(\"docs/matplotlib/\u7b2c\u4e00\u56de\uff1aMatplotlib\u521d\u76f8\u8bc6.pdf\"),\n", " PyPDFLoader(\"docs/matplotlib/\u7b2c\u4e8c\u56de\uff1a\u827a\u672f\u753b\u7b14\u89c1\u4e7e\u5764.pdf\"),\n", " PyPDFLoader(\"docs/matplotlib/\u7b2c\u4e09\u56de\uff1a\u5e03\u5c40\u683c\u5f0f\u5b9a\u65b9\u5706.pdf\")\n", "]\n", "docs_chinese = []\n", "for loader in loaders_chinese:\n", " docs_chinese.extend(loader.load())"]}, {"cell_type": "markdown", "id": "4b497f5c", "metadata": {}, "source": ["\u5728\u6587\u6863\u52a0\u8f7d\u540e\uff0c\u6211\u4eec\u53ef\u4ee5\u4f7f\u7528`RecursiveCharacterTextSplitter`(\u9012\u5f52\u5b57\u7b26\u6587\u672c\u62c6\u5206\u5668)\u6765\u521b\u5efa\u5757\u3002"]}, {"cell_type": "code", "execution_count": 61, "id": "eb44bf0d", "metadata": {"height": 115, "tags": []}, "outputs": [], "source": ["# \u5206\u5272\u6587\u672c\n", "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", "text_splitter = RecursiveCharacterTextSplitter(\n", " chunk_size = 1500, # \u6bcf\u4e2a\u6587\u672c\u5757\u7684\u5927\u5c0f\u3002\u8fd9\u610f\u5473\u7740\u6bcf\u6b21\u5207\u5206\u6587\u672c\u65f6\uff0c\u4f1a\u5c3d\u91cf\u4f7f\u6bcf\u4e2a\u5757\u5305\u542b 1500 \u4e2a\u5b57\u7b26\u3002\n", " chunk_overlap = 150 # \u6bcf\u4e2a\u6587\u672c\u5757\u4e4b\u95f4\u7684\u91cd\u53e0\u90e8\u5206\u3002\n", ")"]}, {"cell_type": "code", "execution_count": 62, "id": "b71e46cc", "metadata": {"height": 30, "tags": []}, "outputs": [], "source": ["splits = 
text_splitter.split_documents(docs)"]}, {"cell_type": "code", "execution_count": 63, "id": "e061f22d", "metadata": {"height": 30, "tags": []}, "outputs": [{"data": {"text/plain": ["209"]}, "execution_count": 63, "metadata": {}, "output_type": "execute_result"}], "source": ["len(splits)"]}, {"cell_type": "code", "execution_count": 64, "id": "99840d4c", "metadata": {}, "outputs": [], "source": ["splits_chinese = text_splitter.split_documents(docs_chinese)"]}, {"cell_type": "code", "execution_count": 65, "id": "d6b60ceb", "metadata": {}, "outputs": [{"data": {"text/plain": ["27"]}, "execution_count": 65, "metadata": {}, "output_type": "execute_result"}], "source": ["len(splits_chinese)"]}, {"cell_type": "markdown", "id": "848e26fd", "metadata": {}, "source": ["## \u4e09\u3001Embeddings\n", "\n", "\u4ec0\u4e48\u662f`Embeddings`\uff1f\n", "\n", "\u5728\u673a\u5668\u5b66\u4e60\u548c\u81ea\u7136\u8bed\u8a00\u5904\u7406\uff08NLP\uff09\u4e2d\uff0c`Embeddings`\uff08\u5d4c\u5165\uff09\u662f\u4e00\u79cd\u5c06\u7c7b\u522b\u6570\u636e\uff0c\u5982\u5355\u8bcd\u3001\u53e5\u5b50\u6216\u8005\u6574\u4e2a\u6587\u6863\uff0c\u8f6c\u5316\u4e3a\u5b9e\u6570\u5411\u91cf\u7684\u6280\u672f\u3002\u8fd9\u4e9b\u5b9e\u6570\u5411\u91cf\u53ef\u4ee5\u88ab\u8ba1\u7b97\u673a\u66f4\u597d\u5730\u7406\u89e3\u548c\u5904\u7406\u3002\u5d4c\u5165\u80cc\u540e\u7684\u4e3b\u8981\u60f3\u6cd5\u662f\uff0c\u76f8\u4f3c\u6216\u76f8\u5173\u7684\u5bf9\u8c61\u5728\u5d4c\u5165\u7a7a\u95f4\u4e2d\u7684\u8ddd\u79bb\u5e94\u8be5\u5f88\u8fd1\u3002\n", "\n", "\u4e3e\u4e2a\u4f8b\u5b50\uff0c\u6211\u4eec\u53ef\u4ee5\u4f7f\u7528\u8bcd\u5d4c\u5165\uff08word embeddings\uff09\u6765\u8868\u793a\u6587\u672c\u6570\u636e\u3002\u5728\u8bcd\u5d4c\u5165\u4e2d\uff0c\u6bcf\u4e2a\u5355\u8bcd\u88ab\u8f6c\u6362\u4e3a\u4e00\u4e2a\u5411\u91cf\uff0c\u8fd9\u4e2a\u5411\u91cf\u6355\u83b7\u4e86\u8fd9\u4e2a\u5355\u8bcd\u7684\u8bed\u4e49\u4fe1\u606f\u3002\u4f8b\u5982\uff0c\"king\" \u548c \"queen\" 
\u8fd9\u4e24\u4e2a\u5355\u8bcd\u5728\u5d4c\u5165\u7a7a\u95f4\u4e2d\u7684\u4f4d\u7f6e\u5c06\u4f1a\u975e\u5e38\u63a5\u8fd1\uff0c\u56e0\u4e3a\u5b83\u4eec\u7684\u542b\u4e49\u76f8\u4f3c\u3002\u800c \"apple\" \u548c \"orange\" \u4e5f\u4f1a\u5f88\u63a5\u8fd1\uff0c\u56e0\u4e3a\u5b83\u4eec\u90fd\u662f\u6c34\u679c\u3002\u800c \"king\" \u548c \"apple\" \u8fd9\u4e24\u4e2a\u5355\u8bcd\u5728\u5d4c\u5165\u7a7a\u95f4\u4e2d\u7684\u8ddd\u79bb\u5c31\u4f1a\u6bd4\u8f83\u8fdc\uff0c\u56e0\u4e3a\u5b83\u4eec\u7684\u542b\u4e49\u4e0d\u540c\u3002\n", "\n", "\u8ba9\u6211\u4eec\u53d6\u51fa\u6211\u4eec\u7684\u5207\u5206\u90e8\u5206\u5e76\u5bf9\u5b83\u4eec\u8fdb\u884c`Embedding`\u5904\u7406\u3002"]}, {"cell_type": "code", "execution_count": 13, "id": "d9dca7a8", "metadata": {"height": 47, "tags": []}, "outputs": [], "source": ["from langchain.embeddings.openai import OpenAIEmbeddings\n", "embedding = OpenAIEmbeddings()"]}, {"cell_type": "markdown", "id": "6ca11792", "metadata": {}, "source": ["\u5728\u4f7f\u7528\u771f\u5b9e\u6587\u6863\u6570\u636e\u7684\u4f8b\u5b50\u4e4b\u524d\uff0c\u8ba9\u6211\u4eec\u7528\u51e0\u4e2a\u6d4b\u8bd5\u6848\u4f8b\u7684\u53e5\u5b50\u6765\u8bd5\u8bd5\uff0c\u4ee5\u4fbf\u4e86\u89e3`embedding`\u3002\n", "\n", "\u4e0b\u9762\u6709\u51e0\u4e2a\u793a\u4f8b\u53e5\u5b50\uff0c\u5176\u4e2d\u524d\u4e24\u4e2a\u975e\u5e38\u76f8\u4f3c\uff0c\u7b2c\u4e09\u4e2a\u4e0e\u4e4b\u65e0\u5173\u3002\u7136\u540e\u6211\u4eec\u53ef\u4ee5\u4f7f\u7528`embedding`\u7c7b\u4e3a\u6bcf\u4e2a\u53e5\u5b50\u521b\u5efa\u4e00\u4e2a`embedding`\u3002"]}, {"cell_type": "code", "execution_count": 14, "id": "abae34b5", "metadata": {}, "outputs": [], "source": ["sentence1 = \"i like dogs\"\n", "sentence2 = \"i like canines\"\n", "sentence3 = \"the weather is ugly outside\""]}, {"cell_type": "code", "execution_count": 15, "id": "d1745b19", "metadata": {}, "outputs": [], "source": ["embedding1 = embedding.embed_query(sentence1)\n", "embedding2 = embedding.embed_query(sentence2)\n", "embedding3 = 
embedding.embed_query(sentence3)"]}, {"cell_type": "code", "execution_count": 16, "id": "c4099521", "metadata": {"height": 64, "tags": []}, "outputs": [], "source": ["sentence1_chinese = \"\u6211\u559c\u6b22\u72d7\"\n", "sentence2_chinese = \"\u6211\u559c\u6b22\u72ac\u79d1\u52a8\u7269\"\n", "sentence3_chinese = \"\u5916\u9762\u7684\u5929\u6c14\u5f88\u7cdf\u7cd5\""]}, {"cell_type": "code", "execution_count": 17, "id": "d553549a", "metadata": {"height": 64, "tags": []}, "outputs": [], "source": ["embedding1_chinese = embedding.embed_query(sentence1_chinese)\n", "embedding2_chinese = embedding.embed_query(sentence2_chinese)\n", "embedding3_chinese = embedding.embed_query(sentence3_chinese)"]}, {"cell_type": "markdown", "id": "6b2805d4", "metadata": {}, "source": ["\u7136\u540e\u6211\u4eec\u53ef\u4ee5\u4f7f\u7528`numpy`\u6765\u6bd4\u8f83\u5b83\u4eec\uff0c\u770b\u770b\u54ea\u4e9b\u6700\u76f8\u4f3c\u3002\n", "\n", "\u6211\u4eec\u671f\u671b\u524d\u4e24\u4e2a\u53e5\u5b50\u5e94\u8be5\u975e\u5e38\u76f8\u4f3c\u3002\n", "\n", "\u7136\u540e\uff0c\u7b2c\u4e00\u548c\u7b2c\u4e8c\u4e2a\u4e0e\u7b2c\u4e09\u4e2a\u76f8\u6bd4\u5e94\u8be5\u76f8\u5dee\u5f88\u5927\u3002\n", "\n", "\u6211\u4eec\u5c06\u4f7f\u7528\u70b9\u79ef\u6765\u6bd4\u8f83\u4e24\u4e2a\u5d4c\u5165\u3002\n", "\n", "\u5982\u679c\u4f60\u4e0d\u77e5\u9053\u4ec0\u4e48\u662f\u70b9\u79ef\uff0c\u6ca1\u5173\u7cfb\u3002\u4f60\u53ea\u9700\u8981\u77e5\u9053\u7684\u91cd\u8981\u4e00\u70b9\u662f\uff0c\u5206\u6570\u8d8a\u9ad8\u53e5\u5b50\u8d8a\u76f8\u4f3c\u3002"]}, {"cell_type": "code", "execution_count": 18, "id": "0cbe9a9e", "metadata": {"height": 30, "tags": []}, "outputs": [], "source": ["import numpy as np"]}, {"cell_type": "code", "execution_count": 19, "id": "536db32e", "metadata": {}, "outputs": [{"data": {"text/plain": ["0.9631853877103518"]}, "execution_count": 19, "metadata": {}, "output_type": "execute_result"}], "source": ["np.dot(embedding1, embedding2)"]}, {"cell_type": "code", "execution_count": 20, "id": "29844c64", 
"metadata": {}, "outputs": [{"data": {"text/plain": ["0.7709997651294672"]}, "execution_count": 20, "metadata": {}, "output_type": "execute_result"}], "source": ["np.dot(embedding1, embedding3)"]}, {"cell_type": "code", "execution_count": 21, "id": "c1e17a9e", "metadata": {}, "outputs": [{"data": {"text/plain": ["0.7596334120325523"]}, "execution_count": 21, "metadata": {}, "output_type": "execute_result"}], "source": ["np.dot(embedding2, embedding3)"]}, {"cell_type": "markdown", "id": "bdf08401", "metadata": {}, "source": ["\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u524d\u4e24\u4e2a`embedding`\u7684\u5206\u6570\u76f8\u5f53\u9ad8\uff0c\u4e3a0.96\u3002\n", "\n", "\u5982\u679c\u6211\u4eec\u5c06\u7b2c\u4e00\u4e2a`embedding`\u4e0e\u7b2c\u4e09\u4e2a`embedding`\u8fdb\u884c\u6bd4\u8f83\uff0c\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u5b83\u660e\u663e\u8f83\u4f4e\uff0c\u7ea6\u4e3a0.77\u3002\n", "\n", "\u5982\u679c\u6211\u4eec\u5c06\u7b2c\u4e8c\u4e2a`embedding`\u548c\u7b2c\u4e09\u4e2a`embedding`\u8fdb\u884c\u6bd4\u8f83\uff0c\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u5b83\u7684\u5206\u6570\u5927\u7ea6\u4e3a0.76\u3002"]}, {"cell_type": "code", "execution_count": 12, "id": "49fc0f8f", "metadata": {"height": 30, "tags": []}, "outputs": [{"data": {"text/plain": ["0.94406149366893"]}, "execution_count": 12, "metadata": {}, "output_type": "execute_result"}], "source": ["np.dot(embedding1_chinese, embedding2_chinese)"]}, {"cell_type": "code", "execution_count": 13, "id": "32a1fac7", "metadata": {"height": 30, "tags": []}, "outputs": [{"data": {"text/plain": ["0.792186975021313"]}, "execution_count": 13, "metadata": {}, "output_type": "execute_result"}], "source": ["np.dot(embedding1_chinese, embedding3_chinese)"]}, {"cell_type": "code", "execution_count": 14, "id": "9dd18328", "metadata": {"height": 30, "tags": []}, "outputs": [{"data": {"text/plain": ["0.7804109942586283"]}, "execution_count": 14, "metadata": {}, "output_type": "execute_result"}], "source": ["np.dot(embedding2_chinese, 
embedding3_chinese)"]}, {"cell_type": "markdown", "id": "9a17a527", "metadata": {}, "source": ["\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u524d\u4e24\u4e2a`embedding`\u7684\u5206\u6570\u76f8\u5f53\u9ad8\uff0c\u4e3a0.94\u3002\n", "\n", "\u5982\u679c\u6211\u4eec\u5c06\u7b2c\u4e00\u4e2a`embedding`\u4e0e\u7b2c\u4e09\u4e2a`embedding`\u8fdb\u884c\u6bd4\u8f83\uff0c\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u5b83\u660e\u663e\u8f83\u4f4e\uff0c\u7ea6\u4e3a0.79\u3002\n", "\n", "\u5982\u679c\u6211\u4eec\u5c06\u7b2c\u4e8c\u4e2a`embedding`\u548c\u7b2c\u4e09\u4e2a`embedding`\u8fdb\u884c\u6bd4\u8f83\uff0c\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u5b83\u7684\u5206\u6570\u5927\u7ea6\u4e3a0.78\u3002"]}, {"cell_type": "markdown", "id": "4fc7b24f", "metadata": {}, "source": ["## \u56db\u3001Vectorstores"]}, {"cell_type": "markdown", "id": "54776973", "metadata": {}, "source": ["### 4.1 \u521d\u59cb\u5316Chroma"]}, {"cell_type": "markdown", "id": "20916c7c", "metadata": {}, "source": ["Langchain\u96c6\u6210\u4e86\u8d85\u8fc730\u4e2a\u4e0d\u540c\u7684\u5411\u91cf\u5b58\u50a8\u5e93\u3002\u6211\u4eec\u9009\u62e9Chroma\u662f\u56e0\u4e3a\u5b83\u8f7b\u91cf\u7ea7\u4e14\u6570\u636e\u5b58\u50a8\u5728\u5185\u5b58\u4e2d\uff0c\u8fd9\u4f7f\u5f97\u5b83\u975e\u5e38\u5bb9\u6613\u542f\u52a8\u548c\u5f00\u59cb\u4f7f\u7528\u3002"]}, {"cell_type": "code", "execution_count": 24, "id": "201e6afa", "metadata": {"height": 30, "tags": []}, "outputs": [], "source": ["from langchain.vectorstores import Chroma"]}, {"cell_type": "code", "execution_count": 28, "id": "93960ac5", "metadata": {"height": 30, "tags": []}, "outputs": [], "source": ["persist_directory = 'docs/chroma/cs229_lectures/'"]}, {"cell_type": "code", "execution_count": null, "id": "a195e72a", "metadata": {"height": 30, "tags": []}, "outputs": [], "source": ["!rm -rf './docs/chroma/cs229_lectures' # 
\u5220\u9664\u65e7\u7684\u6570\u636e\u5e93\u6587\u4ef6\uff08\u5982\u679c\u6587\u4ef6\u5939\u4e2d\u6709\u6587\u4ef6\u7684\u8bdd\uff09\uff0cwindow\u7535\u8111\u8bf7\u624b\u52a8\u5220\u9664"]}, {"cell_type": "code", "execution_count": 29, "id": "690efd0a", "metadata": {"height": 98, "tags": []}, "outputs": [], "source": ["vectordb = Chroma.from_documents(\n", " documents=splits,\n", " embedding=embedding,\n", " persist_directory=persist_directory # \u5141\u8bb8\u6211\u4eec\u5c06persist_directory\u76ee\u5f55\u4fdd\u5b58\u5230\u78c1\u76d8\u4e0a\n", ")"]}, {"cell_type": "code", "execution_count": 30, "id": "f777480c", "metadata": {"height": 30, "tags": []}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["209\n"]}], "source": ["print(vectordb._collection.count())"]}, {"cell_type": "code", "execution_count": 66, "id": "d720e2b7", "metadata": {}, "outputs": [], "source": ["persist_directory_chinese = 'docs/chroma/matplotlib/'"]}, {"cell_type": "code", "execution_count": null, "id": "570a2768", "metadata": {}, "outputs": [], "source": ["!rm -rf './docs/chroma/matplotlib' # \u5220\u9664\u65e7\u7684\u6570\u636e\u5e93\u6587\u4ef6\uff08\u5982\u679c\u6587\u4ef6\u5939\u4e2d\u6709\u6587\u4ef6\u7684\u8bdd\uff09"]}, {"cell_type": "code", "execution_count": 67, "id": "1e03438d", "metadata": {}, "outputs": [], "source": ["vectordb_chinese = Chroma.from_documents(\n", " documents=splits_chinese,\n", " embedding=embedding,\n", " persist_directory=persist_directory_chinese # \u5141\u8bb8\u6211\u4eec\u5c06persist_directory\u76ee\u5f55\u4fdd\u5b58\u5230\u78c1\u76d8\u4e0a\n", ")"]}, {"cell_type": "code", "execution_count": 68, "id": "cf87aba0", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["27\n"]}], "source": ["print(vectordb_chinese._collection.count())"]}, {"cell_type": "markdown", "id": "9f78f412", "metadata": {}, "source": 
["\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u82f1\u6587\u7248\u7684\u957f\u5ea6\u4e5f\u662f209\u3001\u4e2d\u6587\u7248\u7684\u957f\u5ea6\u4e5f\u662f27\uff0c\u8fd9\u4e0e\u6211\u4eec\u4e4b\u524d\u7684\u5207\u5206\u6570\u91cf\u662f\u4e00\u6837\u7684\u3002\u73b0\u5728\u8ba9\u6211\u4eec\u5f00\u59cb\u4f7f\u7528\u5b83\u3002"]}, {"cell_type": "markdown", "id": "efca7589", "metadata": {}, "source": ["### 4.2 \u76f8\u4f3c\u6027\u641c\u7d22(Similarity Search)"]}, {"cell_type": "code", "execution_count": 31, "id": "3e20837d", "metadata": {"height": 30, "tags": []}, "outputs": [], "source": ["question = \"is there an email i can ask for help\" # \"\u6709\u6211\u53ef\u4ee5\u5bfb\u6c42\u5e2e\u52a9\u7684\u7535\u5b50\u90ae\u4ef6\u5417\""]}, {"cell_type": "code", "execution_count": 32, "id": "f9bde572", "metadata": {"height": 30, "tags": []}, "outputs": [], "source": ["docs = vectordb.similarity_search(question,k=3)"]}, {"cell_type": "code", "execution_count": 33, "id": "41388af1", "metadata": {"height": 30, "tags": []}, "outputs": [{"data": {"text/plain": ["3"]}, "execution_count": 33, "metadata": {}, "output_type": "execute_result"}], "source": ["len(docs)"]}, {"cell_type": "code", "execution_count": 34, "id": "183434f6", "metadata": {"height": 30, "tags": []}, "outputs": [{"data": {"text/plain": ["\"cs229-qa@cs.stanford.edu. This goes to an acc ount that's read by all the TAs and me. So \\nrather than sending us email individually, if you send email to this account, it will \\nactually let us get back to you maximally quickly with answers to your questions. \\nIf you're asking questions about homework probl ems, please say in the subject line which \\nassignment and which question the email refers to, since that will also help us to route \\nyour question to the appropriate TA or to me appropriately and get the response back to \\nyou quickly. \\nLet's see. Skipping ahead \u2014 let's see \u2014 for homework, one midterm, one open and term \\nproject. Notice on the honor code. 
So one thi ng that I think will help you to succeed and \\ndo well in this class and even help you to enjoy this cla ss more is if you form a study \\ngroup. \\nSo start looking around where you' re sitting now or at the end of class today, mingle a \\nlittle bit and get to know your classmates. I strongly encourage you to form study groups \\nand sort of have a group of people to study with and have a group of your fellow students \\nto talk over these concepts with. You can also post on the class news group if you want to \\nuse that to try to form a study group. \\nBut some of the problems sets in this cla ss are reasonably difficult. People that have \\ntaken the class before may tell you they were very difficult. And just I bet it would be \\nmore fun for you, and you'd probably have a be tter learning experience if you form a\""]}, "execution_count": 34, "metadata": {}, "output_type": "execute_result"}], "source": ["docs[0].page_content"]}, {"cell_type": "markdown", "id": "1edb21d8", "metadata": {}, "source": ["\u5982\u679c\u6211\u4eec\u67e5\u770b\u7b2c\u4e00\u4e2a\u6587\u6863\u7684\u5185\u5bb9\uff0c\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u5b83\u5b9e\u9645\u4e0a\u662f\u5173\u4e8e\u4e00\u4e2a\u7535\u5b50\u90ae\u4ef6\u5730\u5740\uff0ccs229-qa@cs.stanford.edu\u3002\n", "\n", "\u8fd9\u662f\u6211\u4eec\u53ef\u4ee5\u5411\u5176\u53d1\u9001\u95ee\u9898\u7684\u7535\u5b50\u90ae\u4ef6\uff0c\u6240\u6709\u7684\u52a9\u6559\u90fd\u4f1a\u9605\u8bfb\u8fd9\u4e9b\u90ae\u4ef6\u3002"]}, {"cell_type": "code", "execution_count": 69, "id": "53bcc061", "metadata": {}, "outputs": [], "source": ["question_chinese = \"Matplotlib\u662f\u4ec0\u4e48\uff1f\" "]}, {"cell_type": "code", "execution_count": 70, "id": "2d2dc834", "metadata": {}, "outputs": [], "source": ["docs_chinese = vectordb_chinese.similarity_search(question_chinese,k=3)"]}, {"cell_type": "code", "execution_count": 71, "id": "eebe77e7", "metadata": {}, "outputs": [{"data": {"text/plain": ["3"]}, "execution_count": 71, 
"metadata": {}, "output_type": "execute_result"}], "source": ["len(docs_chinese)"]}, {"cell_type": "code", "execution_count": 72, "id": "860af154", "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u7b2c\u2f00\u56de\uff1aMatplotlib \u521d\u76f8\u8bc6\\n\u2f00\u3001\u8ba4\u8bc6matplotlib\\nMatplotlib \u662f\u2f00\u4e2a Python 2D \u7ed8\u56fe\u5e93\uff0c\u80fd\u591f\u4ee5\u591a\u79cd\u786c\u62f7\u2ec9\u683c\u5f0f\u548c\u8de8\u5e73\u53f0\u7684\u4ea4\u4e92\u5f0f\u73af\u5883\u2f63\u6210\u51fa\u7248\u7269\u8d28\u91cf\u7684\u56fe\u5f62\uff0c\u2f64\u6765\u7ed8\u5236\u5404\u79cd\u9759\u6001\uff0c\u52a8\u6001\uff0c\\n\u4ea4\u4e92\u5f0f\u7684\u56fe\u8868\u3002\\nMatplotlib \u53ef\u2f64\u4e8e Python \u811a\u672c\uff0c Python \u548c IPython Shell \u3001 Jupyter notebook \uff0c Web \u5e94\u2f64\u7a0b\u5e8f\u670d\u52a1\u5668\u548c\u5404\u79cd\u56fe\u5f62\u2f64\u6237\u754c\u2faf\u2f2f\u5177\u5305\u7b49\u3002\\nMatplotlib \u662f Python \u6570\u636e\u53ef\u89c6\u5316\u5e93\u4e2d\u7684\u6cf0\u2f43\uff0c\u5b83\u5df2\u7ecf\u6210\u4e3a python \u4e2d\u516c\u8ba4\u7684\u6570\u636e\u53ef\u89c6\u5316\u2f2f\u5177\uff0c\u6211\u4eec\u6240\u719f\u77e5\u7684 pandas \u548c seaborn \u7684\u7ed8\u56fe\u63a5\u2f1d\\n\u5176\u5b9e\u4e5f\u662f\u57fa\u4e8e matplotlib \u6240\u4f5c\u7684\u2fbc\u7ea7\u5c01\u88c5\u3002\\n\u4e3a\u4e86\u5bf9matplotlib \u6709\u66f4\u597d\u7684\u7406\u89e3\uff0c\u8ba9\u6211\u4eec\u4ece\u2f00\u4e9b\u6700\u57fa\u672c\u7684\u6982\u5ff5\u5f00\u59cb\u8ba4\u8bc6\u5b83\uff0c\u518d\u9010\u6e10\u8fc7\u6e21\u5230\u2f00\u4e9b\u2fbc\u7ea7\u6280\u5de7\u4e2d\u3002\\n\u2f06\u3001\u2f00\u4e2a\u6700\u7b80\u5355\u7684\u7ed8\u56fe\u4f8b\u2f26\\nMatplotlib \u7684\u56fe\u50cf\u662f\u753b\u5728 figure \uff08\u5982 windows \uff0c jupyter \u7a97\u4f53\uff09\u4e0a\u7684\uff0c\u6bcf\u2f00\u4e2a figure \u2f1c\u5305\u542b\u4e86\u2f00\u4e2a\u6216\u591a\u4e2a axes \uff08\u2f00\u4e2a\u53ef\u4ee5\u6307\u5b9a\u5750\u6807\u7cfb\u7684\u2f26\u533a\\n\u57df\uff09\u3002\u6700\u7b80\u5355\u7684\u521b\u5efa 
figure \u4ee5\u53ca axes \u7684\u2f45\u5f0f\u662f\u901a\u8fc7 pyplot.subplots\u547d\u4ee4\uff0c\u521b\u5efa axes \u4ee5\u540e\uff0c\u53ef\u4ee5\u4f7f\u2f64 Axes.plot\u7ed8\u5236\u6700\u7b80\u6613\u7684\u6298\u7ebf\u56fe\u3002\\nimport matplotlib.pyplot as plt\\nimport matplotlib as mpl\\nimport numpy as np\\nfig, ax = plt.subplots() # \u521b\u5efa\u2f00\u4e2a\u5305\u542b\u2f00\u4e2a axes \u7684 figure\\nax.plot([1, 2, 3, 4], [1, 4, 2, 3]); # \u7ed8\u5236\u56fe\u50cf\\nTrick\uff1a \u5728jupyter notebook \u4e2d\u4f7f\u2f64 matplotlib \u65f6\u4f1a\u53d1\u73b0\uff0c\u4ee3\u7801\u8fd0\u2f8f\u540e\u2f83\u52a8\u6253\u5370\u51fa\u7c7b\u4f3c \\n\u8fd9\u6837\u2f00\u6bb5\u8bdd\uff0c\u8fd9\u662f\u56e0\u4e3a matplotlib \u7684\u7ed8\u56fe\u4ee3\u7801\u9ed8\u8ba4\u6253\u5370\u51fa\u6700\u540e\u2f00\u4e2a\u5bf9\u8c61\u3002\u5982\u679c\u4e0d\u60f3\u663e\u793a\u8fd9\u53e5\u8bdd\uff0c\u6709\u4ee5\u4e0b\u4e09\u79cd\u2f45\u6cd5\uff0c\u5728\u672c\u7ae0\u8282\u7684\u4ee3\u7801\u793a\u4f8b\\n\u4e2d\u4f60\u80fd\u627e\u5230\u8fd9\u4e09\u79cd\u2f45\u6cd5\u7684\u4f7f\u2f64\u3002\\n\\x00. \u5728\u4ee3\u7801\u5757\u6700\u540e\u52a0\u2f00\u4e2a\u5206\u53f7 ;\\n\\x00. \u5728\u4ee3\u7801\u5757\u6700\u540e\u52a0\u2f00\u53e5 plt.show()\\n\\x00. 
\u5728\u7ed8\u56fe\u65f6\u5c06\u7ed8\u56fe\u5bf9\u8c61\u663e\u5f0f\u8d4b\u503c\u7ed9\u2f00\u4e2a\u53d8\u91cf\uff0c\u5982\u5c06 plt.plot([1, 2, 3, 4]) \u6539\u6210 line =plt.plot([1, 2, 3, 4])\\n\u548cMATLAB \u547d\u4ee4\u7c7b\u4f3c\uff0c\u4f60\u8fd8\u53ef\u4ee5\u901a\u8fc7\u2f00\u79cd\u66f4\u7b80\u5355\u7684\u2f45\u5f0f\u7ed8\u5236\u56fe\u50cf\uff0c matplotlib.pyplot\u2f45\u6cd5\u80fd\u591f\u76f4\u63a5\u5728\u5f53\u524d axes \u4e0a\u7ed8\u5236\u56fe\u50cf\uff0c\u5982\u679c\u2f64\u6237\\n\u672a\u6307\u5b9aaxes \uff0c matplotlib \u4f1a\u5e2e\u4f60\u2f83\u52a8\u521b\u5efa\u2f00\u4e2a\u3002\u6240\u4ee5\u4e0a\u2faf\u7684\u4f8b\u2f26\u4e5f\u53ef\u4ee5\u7b80\u5316\u4e3a\u4ee5\u4e0b\u8fd9\u2f00\u2f8f\u4ee3\u7801\u3002\\nline =plt.plot([1, 2, 3, 4], [1, 4, 2, 3]) \\n\u4e09\u3001Figure \u7684\u7ec4\u6210\\n\u73b0\u5728\u6211\u4eec\u6765\u6df1\u2f0a\u770b\u2f00\u4e0b figure \u7684\u7ec4\u6210\u3002\u901a\u8fc7\u2f00\u5f20 figure \u89e3\u5256\u56fe\uff0c\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u2f00\u4e2a\u5b8c\u6574\u7684 matplotlib \u56fe\u50cf\u901a\u5e38\u4f1a\u5305\u62ec\u4ee5\u4e0b\u56db\u4e2a\u5c42\u7ea7\uff0c\u8fd9\u4e9b\\n\u5c42\u7ea7\u4e5f\u88ab\u79f0\u4e3a\u5bb9\u5668\uff08 container \uff09\uff0c\u4e0b\u2f00\u8282\u4f1a\u8be6\u7ec6\u4ecb\u7ecd\u3002\u5728 matplotlib \u7684\u4e16\u754c\u4e2d\uff0c\u6211\u4eec\u5c06\u901a\u8fc7\u5404\u79cd\u547d\u4ee4\u2f45\u6cd5\u6765\u64cd\u7eb5\u56fe\u50cf\u4e2d\u7684\u6bcf\u2f00\u4e2a\u90e8\u5206\uff0c\\n\u4ece\u2f7d\u8fbe\u5230\u6570\u636e\u53ef\u89c6\u5316\u7684\u6700\u7ec8\u6548\u679c\uff0c\u2f00\u526f\u5b8c\u6574\u7684\u56fe\u50cf\u5b9e\u9645\u4e0a\u662f\u5404\u7c7b\u2f26\u5143\u7d20\u7684\u96c6\u5408\u3002\\nFigure\uff1a\u9876\u5c42\u7ea7\uff0c\u2f64\u6765\u5bb9\u7eb3\u6240\u6709\u7ed8\u56fe\u5143\u7d20'"]}, "execution_count": 72, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_chinese[0].page_content"]}, {"cell_type": "markdown", "id": "60a720a6", "metadata": {}, "source": 
["\u5982\u679c\u6211\u4eec\u67e5\u770b\u7b2c\u4e00\u4e2a\u6587\u6863\u7684\u5185\u5bb9\uff0c\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u5b83\u5b9e\u9645\u4e0a\u662f\u5173\u4e8eMatplotlib\u7684\u4ecb\u7ecd"]}, {"cell_type": "markdown", "id": "fba57a1d", "metadata": {}, "source": ["\u5728\u6b64\u4e4b\u540e\uff0c\u6211\u4eec\u8981\u786e\u4fdd\u901a\u8fc7\u8fd0\u884cvectordb.persist\u6765\u6301\u4e45\u5316\u5411\u91cf\u6570\u636e\u5e93\uff0c\u4ee5\u4fbf\u6211\u4eec\u5728\u672a\u6765\u7684\u8bfe\u7a0b\u4e2d\u4f7f\u7528\u3002\n", "\n", "\u8ba9\u6211\u4eec\u4fdd\u5b58\u5b83\uff0c\u4ee5\u4fbf\u4ee5\u540e\u4f7f\u7528\uff01"]}, {"cell_type": "code", "execution_count": 44, "id": "ea657123", "metadata": {"height": 30, "tags": []}, "outputs": [], "source": ["vectordb.persist()"]}, {"cell_type": "code", "execution_count": 73, "id": "9065f5b1", "metadata": {}, "outputs": [], "source": ["vectordb_chinese.persist()"]}, {"cell_type": "markdown", "id": "cefe9f6a", "metadata": {}, "source": ["## \u4e94\u3001\u5931\u8d25\u7684\u60c5\u51b5(Failure modes)"]}, {"cell_type": "markdown", "id": "089da0ca", "metadata": {}, "source": ["\u8fd9\u770b\u8d77\u6765\u5f88\u597d\uff0c\u57fa\u672c\u7684\u76f8\u4f3c\u6027\u641c\u7d22\u5f88\u5bb9\u6613\u5c31\u80fd\u8ba9\u4f60\u5b8c\u621080%\u7684\u5de5\u4f5c\u3002\n", "\n", "\u4f46\u662f\uff0c\u53ef\u80fd\u4f1a\u51fa\u73b0\u4e00\u4e9b\u76f8\u4f3c\u6027\u641c\u7d22\u5931\u8d25\u7684\u60c5\u51b5\u3002\n", "\n", "\u8fd9\u91cc\u6709\u4e00\u4e9b\u53ef\u80fd\u51fa\u73b0\u7684\u8fb9\u7f18\u60c5\u51b5\u2014\u2014\u6211\u4eec\u5c06\u5728\u4e0b\u4e00\u5802\u8bfe\u4e2d\u4fee\u590d\u5b83\u4eec\u3002"]}, {"cell_type": "code", "execution_count": 36, "id": "df0f29f9", "metadata": {"height": 30, "tags": []}, "outputs": [], "source": ["question = \"what did they say about matlab?\" # \"\u4ed6\u4eec\u5bf9 matlab \u6709\u4f55\u8bc4\u4ef7\uff1f\""]}, {"cell_type": "code", "execution_count": 37, "id": "02be97df", "metadata": {"height": 30, "tags": []}, "outputs": [], "source": 
["docs = vectordb.similarity_search(question,k=5)"]}, {"cell_type": "code", "execution_count": 51, "id": "34359d7a", "metadata": {}, "outputs": [], "source": ["question_chinese = \"Matplotlib\u662f\u4ec0\u4e48\uff1f\""]}, {"cell_type": "code", "execution_count": 52, "id": "c0546006", "metadata": {}, "outputs": [], "source": ["docs_chinese = vectordb_chinese.similarity_search(question_chinese,k=5)"]}, {"cell_type": "markdown", "id": "2a9f579e", "metadata": {}, "source": ["\u8bf7\u6ce8\u610f\uff0c\u6211\u4eec\u5f97\u5230\u4e86\u91cd\u590d\u7684\u5757\uff08\u56e0\u4e3a\u7d22\u5f15\u4e2d\u6709\u91cd\u590d\u7684 `MachineLearning-Lecture01.pdf`\u3001`\u7b2c\u4e00\u56de\uff1aMatplotlib\u521d\u76f8\u8bc6.pdf`\uff09\u3002\n", "\n", "\u8bed\u4e49\u641c\u7d22\u83b7\u53d6\u6240\u6709\u76f8\u4f3c\u7684\u6587\u6863\uff0c\u4f46\u4e0d\u5f3a\u5236\u591a\u6837\u6027\u3002\n", "\n", "`docs[0]` \u548c `docs[1]` \u662f\u5b8c\u5168\u76f8\u540c\u7684\uff0c\u4ee5\u53ca`docs_chinese[0]` \u548c `docs_chinese[1]` \u662f\u5b8c\u5168\u76f8\u540c\u7684\u3002"]}, {"cell_type": "code", "execution_count": 38, "id": "d39f6954", "metadata": {"height": 30, "tags": []}, "outputs": [{"data": {"text/plain": ["Document(page_content='those homeworks will be done in either MATLA B or in Octave, which is sort of \u2014 I \\nknow some people call it a free ve rsion of MATLAB, which it sort of is, sort of isn\\'t. \\nSo I guess for those of you that haven\\'t s een MATLAB before, and I know most of you \\nhave, MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to \\nplot data. And it\\'s sort of an extremely easy to learn tool to use for implementing a lot of \\nlearning algorithms. 
\\nAnd in case some of you want to work on your own home computer or something if you \\ndon\\'t have a MATLAB license, for the purposes of this class, there\\'s also \u2014 [inaudible] \\nwrite that down [inaudible] MATLAB \u2014 there\\' s also a software package called Octave \\nthat you can download for free off the Internet. And it has somewhat fewer features than MATLAB, but it\\'s free, and for the purposes of this class, it will work for just about \\neverything. \\nSo actually I, well, so yeah, just a side comment for those of you that haven\\'t seen \\nMATLAB before I guess, once a colleague of mine at a different university, not at \\nStanford, actually teaches another machine l earning course. He\\'s taught it for many years. \\nSo one day, he was in his office, and an old student of his from, lik e, ten years ago came \\ninto his office and he said, \"Oh, professo r, professor, thank you so much for your', metadata={'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf', 'page': 8})"]}, "execution_count": 38, "metadata": {}, "output_type": "execute_result"}], "source": ["docs[0]"]}, {"cell_type": "code", "execution_count": 39, "id": "e04e3d1b", "metadata": {"height": 30, "tags": []}, "outputs": [{"data": {"text/plain": ["Document(page_content='those homeworks will be done in either MATLA B or in Octave, which is sort of \u2014 I \\nknow some people call it a free ve rsion of MATLAB, which it sort of is, sort of isn\\'t. \\nSo I guess for those of you that haven\\'t s een MATLAB before, and I know most of you \\nhave, MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to \\nplot data. And it\\'s sort of an extremely easy to learn tool to use for implementing a lot of \\nlearning algorithms. 
\\nAnd in case some of you want to work on your own home computer or something if you \\ndon\\'t have a MATLAB license, for the purposes of this class, there\\'s also \u2014 [inaudible] \\nwrite that down [inaudible] MATLAB \u2014 there\\' s also a software package called Octave \\nthat you can download for free off the Internet. And it has somewhat fewer features than MATLAB, but it\\'s free, and for the purposes of this class, it will work for just about \\neverything. \\nSo actually I, well, so yeah, just a side comment for those of you that haven\\'t seen \\nMATLAB before I guess, once a colleague of mine at a different university, not at \\nStanford, actually teaches another machine l earning course. He\\'s taught it for many years. \\nSo one day, he was in his office, and an old student of his from, lik e, ten years ago came \\ninto his office and he said, \"Oh, professo r, professor, thank you so much for your', metadata={'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf', 'page': 8})"]}, "execution_count": 39, "metadata": {}, "output_type": "execute_result"}], "source": ["docs[1]"]}, {"cell_type": "code", "execution_count": 53, "id": "092fd2f5", "metadata": {}, "outputs": [{"data": {"text/plain": ["Document(page_content='\u7b2c\u2f00\u56de\uff1aMatplotlib \u521d\u76f8\u8bc6\\n\u2f00\u3001\u8ba4\u8bc6matplotlib\\nMatplotlib \u662f\u2f00\u4e2a Python 2D \u7ed8\u56fe\u5e93\uff0c\u80fd\u591f\u4ee5\u591a\u79cd\u786c\u62f7\u2ec9\u683c\u5f0f\u548c\u8de8\u5e73\u53f0\u7684\u4ea4\u4e92\u5f0f\u73af\u5883\u2f63\u6210\u51fa\u7248\u7269\u8d28\u91cf\u7684\u56fe\u5f62\uff0c\u2f64\u6765\u7ed8\u5236\u5404\u79cd\u9759\u6001\uff0c\u52a8\u6001\uff0c\\n\u4ea4\u4e92\u5f0f\u7684\u56fe\u8868\u3002\\nMatplotlib \u53ef\u2f64\u4e8e Python \u811a\u672c\uff0c Python \u548c IPython Shell \u3001 Jupyter notebook \uff0c Web \u5e94\u2f64\u7a0b\u5e8f\u670d\u52a1\u5668\u548c\u5404\u79cd\u56fe\u5f62\u2f64\u6237\u754c\u2faf\u2f2f\u5177\u5305\u7b49\u3002\\nMatplotlib \u662f Python 
\u6570\u636e\u53ef\u89c6\u5316\u5e93\u4e2d\u7684\u6cf0\u2f43\uff0c\u5b83\u5df2\u7ecf\u6210\u4e3a python \u4e2d\u516c\u8ba4\u7684\u6570\u636e\u53ef\u89c6\u5316\u2f2f\u5177\uff0c\u6211\u4eec\u6240\u719f\u77e5\u7684 pandas \u548c seaborn \u7684\u7ed8\u56fe\u63a5\u2f1d\\n\u5176\u5b9e\u4e5f\u662f\u57fa\u4e8e matplotlib \u6240\u4f5c\u7684\u2fbc\u7ea7\u5c01\u88c5\u3002\\n\u4e3a\u4e86\u5bf9matplotlib \u6709\u66f4\u597d\u7684\u7406\u89e3\uff0c\u8ba9\u6211\u4eec\u4ece\u2f00\u4e9b\u6700\u57fa\u672c\u7684\u6982\u5ff5\u5f00\u59cb\u8ba4\u8bc6\u5b83\uff0c\u518d\u9010\u6e10\u8fc7\u6e21\u5230\u2f00\u4e9b\u2fbc\u7ea7\u6280\u5de7\u4e2d\u3002\\n\u2f06\u3001\u2f00\u4e2a\u6700\u7b80\u5355\u7684\u7ed8\u56fe\u4f8b\u2f26\\nMatplotlib \u7684\u56fe\u50cf\u662f\u753b\u5728 figure \uff08\u5982 windows \uff0c jupyter \u7a97\u4f53\uff09\u4e0a\u7684\uff0c\u6bcf\u2f00\u4e2a figure \u2f1c\u5305\u542b\u4e86\u2f00\u4e2a\u6216\u591a\u4e2a axes \uff08\u2f00\u4e2a\u53ef\u4ee5\u6307\u5b9a\u5750\u6807\u7cfb\u7684\u2f26\u533a\\n\u57df\uff09\u3002\u6700\u7b80\u5355\u7684\u521b\u5efa figure \u4ee5\u53ca axes \u7684\u2f45\u5f0f\u662f\u901a\u8fc7 pyplot.subplots\u547d\u4ee4\uff0c\u521b\u5efa axes \u4ee5\u540e\uff0c\u53ef\u4ee5\u4f7f\u2f64 Axes.plot\u7ed8\u5236\u6700\u7b80\u6613\u7684\u6298\u7ebf\u56fe\u3002\\nimport matplotlib.pyplot as plt\\nimport matplotlib as mpl\\nimport numpy as np\\nfig, ax = plt.subplots() # \u521b\u5efa\u2f00\u4e2a\u5305\u542b\u2f00\u4e2a axes \u7684 figure\\nax.plot([1, 2, 3, 4], [1, 4, 2, 3]); # \u7ed8\u5236\u56fe\u50cf\\nTrick\uff1a \u5728jupyter notebook \u4e2d\u4f7f\u2f64 matplotlib \u65f6\u4f1a\u53d1\u73b0\uff0c\u4ee3\u7801\u8fd0\u2f8f\u540e\u2f83\u52a8\u6253\u5370\u51fa\u7c7b\u4f3c \\n\u8fd9\u6837\u2f00\u6bb5\u8bdd\uff0c\u8fd9\u662f\u56e0\u4e3a matplotlib 
\u7684\u7ed8\u56fe\u4ee3\u7801\u9ed8\u8ba4\u6253\u5370\u51fa\u6700\u540e\u2f00\u4e2a\u5bf9\u8c61\u3002\u5982\u679c\u4e0d\u60f3\u663e\u793a\u8fd9\u53e5\u8bdd\uff0c\u6709\u4ee5\u4e0b\u4e09\u79cd\u2f45\u6cd5\uff0c\u5728\u672c\u7ae0\u8282\u7684\u4ee3\u7801\u793a\u4f8b\\n\u4e2d\u4f60\u80fd\u627e\u5230\u8fd9\u4e09\u79cd\u2f45\u6cd5\u7684\u4f7f\u2f64\u3002\\n\\x00. \u5728\u4ee3\u7801\u5757\u6700\u540e\u52a0\u2f00\u4e2a\u5206\u53f7 ;\\n\\x00. \u5728\u4ee3\u7801\u5757\u6700\u540e\u52a0\u2f00\u53e5 plt.show()\\n\\x00. \u5728\u7ed8\u56fe\u65f6\u5c06\u7ed8\u56fe\u5bf9\u8c61\u663e\u5f0f\u8d4b\u503c\u7ed9\u2f00\u4e2a\u53d8\u91cf\uff0c\u5982\u5c06 plt.plot([1, 2, 3, 4]) \u6539\u6210 line =plt.plot([1, 2, 3, 4])\\n\u548cMATLAB \u547d\u4ee4\u7c7b\u4f3c\uff0c\u4f60\u8fd8\u53ef\u4ee5\u901a\u8fc7\u2f00\u79cd\u66f4\u7b80\u5355\u7684\u2f45\u5f0f\u7ed8\u5236\u56fe\u50cf\uff0c matplotlib.pyplot\u2f45\u6cd5\u80fd\u591f\u76f4\u63a5\u5728\u5f53\u524d axes \u4e0a\u7ed8\u5236\u56fe\u50cf\uff0c\u5982\u679c\u2f64\u6237\\n\u672a\u6307\u5b9aaxes \uff0c matplotlib \u4f1a\u5e2e\u4f60\u2f83\u52a8\u521b\u5efa\u2f00\u4e2a\u3002\u6240\u4ee5\u4e0a\u2faf\u7684\u4f8b\u2f26\u4e5f\u53ef\u4ee5\u7b80\u5316\u4e3a\u4ee5\u4e0b\u8fd9\u2f00\u2f8f\u4ee3\u7801\u3002\\nline =plt.plot([1, 2, 3, 4], [1, 4, 2, 3]) \\n\u4e09\u3001Figure \u7684\u7ec4\u6210\\n\u73b0\u5728\u6211\u4eec\u6765\u6df1\u2f0a\u770b\u2f00\u4e0b figure \u7684\u7ec4\u6210\u3002\u901a\u8fc7\u2f00\u5f20 figure \u89e3\u5256\u56fe\uff0c\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u2f00\u4e2a\u5b8c\u6574\u7684 matplotlib \u56fe\u50cf\u901a\u5e38\u4f1a\u5305\u62ec\u4ee5\u4e0b\u56db\u4e2a\u5c42\u7ea7\uff0c\u8fd9\u4e9b\\n\u5c42\u7ea7\u4e5f\u88ab\u79f0\u4e3a\u5bb9\u5668\uff08 container \uff09\uff0c\u4e0b\u2f00\u8282\u4f1a\u8be6\u7ec6\u4ecb\u7ecd\u3002\u5728 matplotlib 
\u7684\u4e16\u754c\u4e2d\uff0c\u6211\u4eec\u5c06\u901a\u8fc7\u5404\u79cd\u547d\u4ee4\u2f45\u6cd5\u6765\u64cd\u7eb5\u56fe\u50cf\u4e2d\u7684\u6bcf\u2f00\u4e2a\u90e8\u5206\uff0c\\n\u4ece\u2f7d\u8fbe\u5230\u6570\u636e\u53ef\u89c6\u5316\u7684\u6700\u7ec8\u6548\u679c\uff0c\u2f00\u526f\u5b8c\u6574\u7684\u56fe\u50cf\u5b9e\u9645\u4e0a\u662f\u5404\u7c7b\u2f26\u5143\u7d20\u7684\u96c6\u5408\u3002\\nFigure\uff1a\u9876\u5c42\u7ea7\uff0c\u2f64\u6765\u5bb9\u7eb3\u6240\u6709\u7ed8\u56fe\u5143\u7d20\\uf03a Contents \\n\u2f00\u3001\u8ba4\u8bc6matplotlib\\n\u2f06\u3001\u2f00\u4e2a\u6700\u7b80\u5355\u7684\u7ed8\u56fe\u4f8b\u2f26\\n\u4e09\u3001Figure \u7684\u7ec4\u6210', metadata={'source': 'docs/matplotlib/\u7b2c\u4e00\u56de\uff1aMatplotlib\u521d\u76f8\u8bc6.pdf', 'page': 0})"]}, "execution_count": 53, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_chinese[0]"]}, {"cell_type": "code", "execution_count": 54, "id": "270f0a10", "metadata": {}, "outputs": [{"data": {"text/plain": ["Document(page_content='\u7b2c\u2f00\u56de\uff1aMatplotlib \u521d\u76f8\u8bc6\\n\u2f00\u3001\u8ba4\u8bc6matplotlib\\nMatplotlib \u662f\u2f00\u4e2a Python 2D \u7ed8\u56fe\u5e93\uff0c\u80fd\u591f\u4ee5\u591a\u79cd\u786c\u62f7\u2ec9\u683c\u5f0f\u548c\u8de8\u5e73\u53f0\u7684\u4ea4\u4e92\u5f0f\u73af\u5883\u2f63\u6210\u51fa\u7248\u7269\u8d28\u91cf\u7684\u56fe\u5f62\uff0c\u2f64\u6765\u7ed8\u5236\u5404\u79cd\u9759\u6001\uff0c\u52a8\u6001\uff0c\\n\u4ea4\u4e92\u5f0f\u7684\u56fe\u8868\u3002\\nMatplotlib \u53ef\u2f64\u4e8e Python \u811a\u672c\uff0c Python \u548c IPython Shell \u3001 Jupyter notebook \uff0c Web \u5e94\u2f64\u7a0b\u5e8f\u670d\u52a1\u5668\u548c\u5404\u79cd\u56fe\u5f62\u2f64\u6237\u754c\u2faf\u2f2f\u5177\u5305\u7b49\u3002\\nMatplotlib \u662f Python \u6570\u636e\u53ef\u89c6\u5316\u5e93\u4e2d\u7684\u6cf0\u2f43\uff0c\u5b83\u5df2\u7ecf\u6210\u4e3a python \u4e2d\u516c\u8ba4\u7684\u6570\u636e\u53ef\u89c6\u5316\u2f2f\u5177\uff0c\u6211\u4eec\u6240\u719f\u77e5\u7684 pandas \u548c seaborn 
\u7684\u7ed8\u56fe\u63a5\u2f1d\\n\u5176\u5b9e\u4e5f\u662f\u57fa\u4e8e matplotlib \u6240\u4f5c\u7684\u2fbc\u7ea7\u5c01\u88c5\u3002\\n\u4e3a\u4e86\u5bf9matplotlib \u6709\u66f4\u597d\u7684\u7406\u89e3\uff0c\u8ba9\u6211\u4eec\u4ece\u2f00\u4e9b\u6700\u57fa\u672c\u7684\u6982\u5ff5\u5f00\u59cb\u8ba4\u8bc6\u5b83\uff0c\u518d\u9010\u6e10\u8fc7\u6e21\u5230\u2f00\u4e9b\u2fbc\u7ea7\u6280\u5de7\u4e2d\u3002\\n\u2f06\u3001\u2f00\u4e2a\u6700\u7b80\u5355\u7684\u7ed8\u56fe\u4f8b\u2f26\\nMatplotlib \u7684\u56fe\u50cf\u662f\u753b\u5728 figure \uff08\u5982 windows \uff0c jupyter \u7a97\u4f53\uff09\u4e0a\u7684\uff0c\u6bcf\u2f00\u4e2a figure \u2f1c\u5305\u542b\u4e86\u2f00\u4e2a\u6216\u591a\u4e2a axes \uff08\u2f00\u4e2a\u53ef\u4ee5\u6307\u5b9a\u5750\u6807\u7cfb\u7684\u2f26\u533a\\n\u57df\uff09\u3002\u6700\u7b80\u5355\u7684\u521b\u5efa figure \u4ee5\u53ca axes \u7684\u2f45\u5f0f\u662f\u901a\u8fc7 pyplot.subplots\u547d\u4ee4\uff0c\u521b\u5efa axes \u4ee5\u540e\uff0c\u53ef\u4ee5\u4f7f\u2f64 Axes.plot\u7ed8\u5236\u6700\u7b80\u6613\u7684\u6298\u7ebf\u56fe\u3002\\nimport matplotlib.pyplot as plt\\nimport matplotlib as mpl\\nimport numpy as np\\nfig, ax = plt.subplots() # \u521b\u5efa\u2f00\u4e2a\u5305\u542b\u2f00\u4e2a axes \u7684 figure\\nax.plot([1, 2, 3, 4], [1, 4, 2, 3]); # \u7ed8\u5236\u56fe\u50cf\\nTrick\uff1a \u5728jupyter notebook \u4e2d\u4f7f\u2f64 matplotlib \u65f6\u4f1a\u53d1\u73b0\uff0c\u4ee3\u7801\u8fd0\u2f8f\u540e\u2f83\u52a8\u6253\u5370\u51fa\u7c7b\u4f3c \\n\u8fd9\u6837\u2f00\u6bb5\u8bdd\uff0c\u8fd9\u662f\u56e0\u4e3a matplotlib \u7684\u7ed8\u56fe\u4ee3\u7801\u9ed8\u8ba4\u6253\u5370\u51fa\u6700\u540e\u2f00\u4e2a\u5bf9\u8c61\u3002\u5982\u679c\u4e0d\u60f3\u663e\u793a\u8fd9\u53e5\u8bdd\uff0c\u6709\u4ee5\u4e0b\u4e09\u79cd\u2f45\u6cd5\uff0c\u5728\u672c\u7ae0\u8282\u7684\u4ee3\u7801\u793a\u4f8b\\n\u4e2d\u4f60\u80fd\u627e\u5230\u8fd9\u4e09\u79cd\u2f45\u6cd5\u7684\u4f7f\u2f64\u3002\\n\\x00. \u5728\u4ee3\u7801\u5757\u6700\u540e\u52a0\u2f00\u4e2a\u5206\u53f7 ;\\n\\x00. 
\u5728\u4ee3\u7801\u5757\u6700\u540e\u52a0\u2f00\u53e5 plt.show()\\n\\x00. \u5728\u7ed8\u56fe\u65f6\u5c06\u7ed8\u56fe\u5bf9\u8c61\u663e\u5f0f\u8d4b\u503c\u7ed9\u2f00\u4e2a\u53d8\u91cf\uff0c\u5982\u5c06 plt.plot([1, 2, 3, 4]) \u6539\u6210 line =plt.plot([1, 2, 3, 4])\\n\u548cMATLAB \u547d\u4ee4\u7c7b\u4f3c\uff0c\u4f60\u8fd8\u53ef\u4ee5\u901a\u8fc7\u2f00\u79cd\u66f4\u7b80\u5355\u7684\u2f45\u5f0f\u7ed8\u5236\u56fe\u50cf\uff0c matplotlib.pyplot\u2f45\u6cd5\u80fd\u591f\u76f4\u63a5\u5728\u5f53\u524d axes \u4e0a\u7ed8\u5236\u56fe\u50cf\uff0c\u5982\u679c\u2f64\u6237\\n\u672a\u6307\u5b9aaxes \uff0c matplotlib \u4f1a\u5e2e\u4f60\u2f83\u52a8\u521b\u5efa\u2f00\u4e2a\u3002\u6240\u4ee5\u4e0a\u2faf\u7684\u4f8b\u2f26\u4e5f\u53ef\u4ee5\u7b80\u5316\u4e3a\u4ee5\u4e0b\u8fd9\u2f00\u2f8f\u4ee3\u7801\u3002\\nline =plt.plot([1, 2, 3, 4], [1, 4, 2, 3]) \\n\u4e09\u3001Figure \u7684\u7ec4\u6210\\n\u73b0\u5728\u6211\u4eec\u6765\u6df1\u2f0a\u770b\u2f00\u4e0b figure \u7684\u7ec4\u6210\u3002\u901a\u8fc7\u2f00\u5f20 figure \u89e3\u5256\u56fe\uff0c\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u2f00\u4e2a\u5b8c\u6574\u7684 matplotlib \u56fe\u50cf\u901a\u5e38\u4f1a\u5305\u62ec\u4ee5\u4e0b\u56db\u4e2a\u5c42\u7ea7\uff0c\u8fd9\u4e9b\\n\u5c42\u7ea7\u4e5f\u88ab\u79f0\u4e3a\u5bb9\u5668\uff08 container \uff09\uff0c\u4e0b\u2f00\u8282\u4f1a\u8be6\u7ec6\u4ecb\u7ecd\u3002\u5728 matplotlib \u7684\u4e16\u754c\u4e2d\uff0c\u6211\u4eec\u5c06\u901a\u8fc7\u5404\u79cd\u547d\u4ee4\u2f45\u6cd5\u6765\u64cd\u7eb5\u56fe\u50cf\u4e2d\u7684\u6bcf\u2f00\u4e2a\u90e8\u5206\uff0c\\n\u4ece\u2f7d\u8fbe\u5230\u6570\u636e\u53ef\u89c6\u5316\u7684\u6700\u7ec8\u6548\u679c\uff0c\u2f00\u526f\u5b8c\u6574\u7684\u56fe\u50cf\u5b9e\u9645\u4e0a\u662f\u5404\u7c7b\u2f26\u5143\u7d20\u7684\u96c6\u5408\u3002\\nFigure\uff1a\u9876\u5c42\u7ea7\uff0c\u2f64\u6765\u5bb9\u7eb3\u6240\u6709\u7ed8\u56fe\u5143\u7d20\\uf03a Contents \\n\u2f00\u3001\u8ba4\u8bc6matplotlib\\n\u2f06\u3001\u2f00\u4e2a\u6700\u7b80\u5355\u7684\u7ed8\u56fe\u4f8b\u2f26\\n\u4e09\u3001Figure 
\u7684\u7ec4\u6210', metadata={'source': 'docs/matplotlib/\u7b2c\u4e00\u56de\uff1aMatplotlib\u521d\u76f8\u8bc6.pdf', 'page': 0})"]}, "execution_count": 54, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_chinese[1]"]}, {"cell_type": "markdown", "id": "3a3a915d", "metadata": {}, "source": ["\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u4e00\u79cd\u65b0\u7684\u5931\u8d25\u7684\u60c5\u51b5\u3002\n", "\n", "\u4e0b\u9762\u7684\u95ee\u9898\u8be2\u95ee\u4e86\u5173\u4e8e\u7b2c\u4e09\u8bb2\u7684\u95ee\u9898\uff0c\u4f46\u4e5f\u5305\u62ec\u4e86\u6765\u81ea\u5176\u4ed6\u8bb2\u7684\u7ed3\u679c\u3002"]}, {"cell_type": "code", "execution_count": 40, "id": "b19135e5", "metadata": {"height": 45, "tags": []}, "outputs": [], "source": ["question = \"what did they say about regression in the third lecture?\" # \"\u4ed6\u4eec\u5728\u7b2c\u4e09\u8bb2\u4e2d\u662f\u600e\u4e48\u8c08\u8bba\u56de\u5f52\u7684\uff1f\""]}, {"cell_type": "code", "execution_count": 41, "id": "d434942c", "metadata": {"height": 30, "tags": []}, "outputs": [], "source": ["docs = vectordb.similarity_search(question,k=5)"]}, {"cell_type": "code", "execution_count": 42, "id": "c2c5df59", "metadata": {"height": 47, "tags": []}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 0}\n", "{'source': 'docs/cs229_lectures/MachineLearning-Lecture02.pdf', 'page': 2}\n", "{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 14}\n", "{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 4}\n", "{'source': 'docs/cs229_lectures/MachineLearning-Lecture02.pdf', 'page': 0}\n"]}], "source": ["for doc in docs:\n", " print(doc.metadata)"]}, {"cell_type": "code", "execution_count": 43, "id": "8e6cb50c", "metadata": {"height": 30, "tags": []}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["really makes a difference between a good so lution and amazing solution. 
And to give \n", "everyone to just how we do points assignments, or what is it that causes a solution to get \n", "full marks, or just how to write amazing so lutions. Becoming a grad er is usually a good \n", "way to do that. \n", "Graders are paid positions and you also get free food, and it's usually fun for us to sort of \n", "hang out for an evening and grade all the a ssignments. Okay, so I will send email. So \n", "don't email me yet if you want to be a grader. I'll send email to the entire class later with \n", "the administrative details and to solicit app lications. So you can email us back then, to \n", "apply, if you'd be interested in being a grader. \n", "Okay, any questions about that? All right, okay, so let's get started with today's material. \n", "So welcome back to the second lecture. What I want to do today is talk about linear \n", "regression, gradient descent, and the norma l equations. And I should also say, lecture \n", "notes have been posted online and so if some of the math I go over today, I go over rather \n", "quickly, if you want to see every equation wr itten out and work through the details more \n", "slowly yourself, go to the course homepage and download detailed lecture notes that \n", "pretty much describe all the mathematical, te chnical contents I'm going to go over today. 
\n", "Today, I'm also going to delve into a fair amount \u2013 some amount of linear algebra, and so\n"]}], "source": ["print(docs[4].page_content)"]}, {"cell_type": "code", "execution_count": 74, "id": "e0a7e9c8", "metadata": {}, "outputs": [], "source": ["question_chinese = \"\u4ed6\u4eec\u5728\u7b2c\u4e8c\u8bb2\u4e2d\u5bf9Figure\u8bf4\u4e86\u4e9b\u4ec0\u4e48\uff1f\" "]}, {"cell_type": "code", "execution_count": 75, "id": "62c0bfc6", "metadata": {}, "outputs": [], "source": ["docs_chinese = vectordb_chinese.similarity_search(question_chinese,k=5)"]}, {"cell_type": "code", "execution_count": 76, "id": "26bd586d", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["{'source': 'docs/matplotlib/\u7b2c\u4e00\u56de\uff1aMatplotlib\u521d\u76f8\u8bc6.pdf', 'page': 0}\n", "{'source': 'docs/matplotlib/\u7b2c\u4e00\u56de\uff1aMatplotlib\u521d\u76f8\u8bc6.pdf', 'page': 0}\n", "{'source': 'docs/matplotlib/\u7b2c\u4e8c\u56de\uff1a\u827a\u672f\u753b\u7b14\u89c1\u4e7e\u5764.pdf', 'page': 9}\n", "{'source': 'docs/matplotlib/\u7b2c\u4e8c\u56de\uff1a\u827a\u672f\u753b\u7b14\u89c1\u4e7e\u5764.pdf', 'page': 10}\n", "{'source': 'docs/matplotlib/\u7b2c\u4e00\u56de\uff1aMatplotlib\u521d\u76f8\u8bc6.pdf', 'page': 1}\n"]}], "source": ["for doc_chinese in docs_chinese:\n", " print(doc_chinese.metadata)"]}, {"cell_type": "code", "execution_count": 77, "id": "71b6bbf3", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["\u4e09\u3001\u5bf9\u8c61\u5bb9\u5668 - Object container\n", "\u5bb9\u5668\u4f1a\u5305\u542b\u2f00\u4e9b primitives\uff0c\u5e76\u4e14\u5bb9\u5668\u8fd8\u6709\u5b83\u2f83\u8eab\u7684\u5c5e\u6027\u3002\n", "\u2f50\u5982Axes Artist\uff0c\u5b83\u662f\u2f00\u79cd\u5bb9\u5668\uff0c\u5b83\u5305\u542b\u4e86\u5f88\u591a primitives\uff0c\u2f50\u5982Line2D\uff0cText\uff1b\u540c\u65f6\uff0c\u5b83\u4e5f\u6709\u2f83\u8eab\u7684\u5c5e\u6027\uff0c\u2f50\u5982 xscal\uff0c\u2f64\u6765\u63a7\u5236\n", 
"X\u8f74\u662flinear\u8fd8\u662flog\u7684\u3002\n", "1. Figure\u5bb9\u5668\n", "matplotlib.figure.Figure\u662fArtist\u6700\u9876\u5c42\u7684 container\u5bf9\u8c61\u5bb9\u5668\uff0c\u5b83\u5305\u542b\u4e86\u56fe\u8868\u4e2d\u7684\u6240\u6709\u5143\u7d20\u3002\u2f00\u5f20\u56fe\u8868\u7684\u80cc\u666f\u5c31\u662f\u5728\n", "Figure.patch\u7684\u2f00\u4e2a\u77e9\u5f62 Rectangle\u3002\n", "\u5f53\u6211\u4eec\u5411\u56fe\u8868\u6dfb\u52a0 Figure.add_subplot()\u6216\u8005Figure.add_axes()\u5143\u7d20\u65f6\uff0c\u8fd9\u4e9b\u90fd\u4f1a\u88ab\u6dfb\u52a0\u5230 Figure.axes\u5217\u8868\u4e2d\u3002\n", "fig = plt.figure()\n", "ax1 = fig.add_subplot(211) # \u4f5c\u2f00\u5e452*1 \u7684\u56fe\uff0c\u9009\u62e9\u7b2c 1 \u4e2a\u2f26\u56fe\n", "ax2 = fig.add_axes([0.1, 0.1, 0.7, 0.3]) # \u4f4d\u7f6e\u53c2\u6570\uff0c\u56db\u4e2a\u6570\u5206\u522b\u4ee3\u8868\u4e86\n", "(left,bottom,width,height)\n", "print(ax1) \n", "print(fig.axes) # fig.axes \u4e2d\u5305\u542b\u4e86 subplot \u548c axes \u4e24\u4e2a\u5b9e\u4f8b , \u521a\u521a\u6dfb\u52a0\u7684\n", "AxesSubplot(0.125,0.536818;0.775x0.343182)\n", "[, ]\n", "\u7531\u4e8eFigure\u7ef4\u6301\u4e86current axes\uff0c\u56e0\u6b64\u4f60\u4e0d\u5e94\u8be5\u2f3f\u52a8\u7684\u4ece Figure.axes\u5217\u8868\u4e2d\u6dfb\u52a0\u5220\u9664\u5143\u7d20\uff0c\u2f7d\u662f\u8981\u901a\u8fc7 Figure.add_subplot()\u3001\n", "Figure.add_axes()\u6765\u6dfb\u52a0\u5143\u7d20\uff0c\u901a\u8fc7 Figure.delaxes()\u6765\u5220\u9664\u5143\u7d20\u3002\u4f46\u662f\u4f60\u53ef\u4ee5\u8fed\u4ee3\u6216\u8005\u8bbf\u95ee Figure.axes\u4e2d\u7684Axes\uff0c\u7136\u540e\u4fee\u6539\u8fd9\u4e2a\n", "Axes\u7684\u5c5e\u6027\u3002\n", "\u2f50\u5982\u4e0b\u2faf\u7684\u904d\u5386 axes \u2fa5\u7684\u5185\u5bb9\uff0c\u5e76\u4e14\u6dfb\u52a0\u2f79\u683c\u7ebf\uff1a\n", "fig = plt.figure()\n", "ax1 = fig.add_subplot(211)\n", "for ax in fig.axes:\n", " ax.grid(True)\n", "Figure\u4e5f\u6709\u5b83\u2f83\u2f30\u7684 text\u3001line \u3001 patch \u3001 
image\u3002\u4f60\u53ef\u4ee5\u76f4\u63a5\u901a\u8fc7 add primitive\u8bed\u53e5\u76f4\u63a5\u6dfb\u52a0\u3002\u4f46\u662f\u6ce8\u610f Figure\u9ed8\u8ba4\u7684\u5750\u6807\u7cfb\u662f\u4ee5\u50cf\n", "\u7d20\u4e3a\u5355\u4f4d\uff0c\u4f60\u53ef\u80fd\u9700\u8981\u8f6c\u6362\u6210 figure \u5750\u6807\u7cfb\uff1a (0,0) \u8868\u793a\u5de6\u4e0b\u70b9\uff0c (1,1) \u8868\u793a\u53f3\u4e0a\u70b9\u3002\n", "Figure\u5bb9\u5668\u7684\u5e38\u2ec5\u5c5e\u6027\uff1a\n", "Figure.patch\u5c5e\u6027\uff1aFigure \u7684\u80cc\u666f\u77e9\u5f62\n", "Figure.axes\u5c5e\u6027\uff1a\u2f00\u4e2a Axes \u5b9e\u4f8b\u7684\u5217\u8868\uff08\u5305\u62ec Subplot)\n", "Figure.images\u5c5e\u6027\uff1a\u2f00\u4e2a FigureImages patch \u5217\u8868\n", "Figure.lines\u5c5e\u6027\uff1a\u2f00\u4e2a Line2D \u5b9e\u4f8b\u7684\u5217\u8868\uff08\u5f88\u5c11\u4f7f\u2f64\uff09\n", "Figure.legends\u5c5e\u6027\uff1a\u2f00\u4e2a Figure Legend \u5b9e\u4f8b\u5217\u8868\uff08\u4e0d\u540c\u4e8e Axes.legends)\n", "Figure.texts\u5c5e\u6027\uff1a\u2f00\u4e2a Figure Text \u5b9e\u4f8b\u5217\u8868\n"]}], "source": ["print(docs_chinese[2].page_content)"]}, {"cell_type": "markdown", "id": "c3dbca56", "metadata": {}, "source": ["\u5728\u4e0b\u4e00\u8bb2\u4e2d\u8ba8\u8bba\u7684\u65b9\u6cd5\u53ef\u4ee5\u7528\u6765\u89e3\u51b3\u8fd9\u4e24\u4e2a\u95ee\u9898\uff01"]}], "metadata": {"kernelspec": {"display_name": "auto-gpt", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16"}}, "nbformat": 4, "nbformat_minor": 5}
\ No newline at end of file
diff --git "a/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/5.\346\243\200\347\264\242 retrieval.ipynb" "b/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/5.\346\243\200\347\264\242 retrieval.ipynb"
index 5aec2981..62200110 100644
--- "a/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/5.\346\243\200\347\264\242 retrieval.ipynb"
+++ "b/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/5.\346\243\200\347\264\242 retrieval.ipynb"
@@ -1 +1 @@
-{"cells":[{"attachments":{},"cell_type":"markdown","id":"0689733d","metadata":{},"source":["# 第五章 检索(Retrieval)\n","\n"," - [一、向量数据库检索](#一、向量数据库检索)\n"," - [1.1 相似性搜索](#1.1-相似性搜索)\n"," - [1.2 解决多样性:最大边际相关性(MMR)](#1.2-解决多样性:最大边际相关性(MMR))\n"," - [1.3 解决特殊性:使用元数据](#1.3-解决特殊性:使用元数据)\n"," - [1.4 解决特殊性:在元数据中使用自查询检索器](#1.4-解决特殊性:在元数据中使用自查询检索器)\n"," - [1.5 其他技巧:压缩](#1.5-其他技巧:压缩)\n"," - [二、结合各种技术](#二、结合各种技术)\n"," - [三、其他类型的检索](#三、其他类型的检索)\n"]},{"attachments":{},"cell_type":"markdown","id":"d12fbd74","metadata":{},"source":["\n","检索是我们的检索增强生成(RAG)流程的核心。\n","\n","让我们获得前面课程存储的向量数据库(`VectorDB`)。"]},{"attachments":{},"cell_type":"markdown","id":"ed2569c6","metadata":{},"source":["## 一、向量数据库检索"]},{"attachments":{},"cell_type":"markdown","id":"651a10db","metadata":{},"source":["在当前文件夹下新建`.env`文件,内容为`OPENAI_API_KEY = \"sk-...\"`\n","\n","本章节需要使用`lark`包,可在终端中运行`pip install lark`命令安装"]},{"cell_type":"code","execution_count":1,"id":"51b15e5c-9b92-4d40-a149-b56335d330d9","metadata":{"tags":[]},"outputs":[],"source":["import os\n","import openai\n","import sys\n","sys.path.append('../..')\n","\n","from dotenv import load_dotenv, find_dotenv\n","_ = load_dotenv(find_dotenv()) # read local .env file\n","\n","openai.api_key = os.environ['OPENAI_API_KEY']\n"]},{"attachments":{},"cell_type":"markdown","id":"c2d552e1","metadata":{},"source":["### 1.1 相似性搜索"]},{"cell_type":"code","execution_count":2,"id":"fe368042","metadata":{"tags":[]},"outputs":[],"source":["from langchain.vectorstores import Chroma\n","from langchain.embeddings.openai import OpenAIEmbeddings\n","persist_directory = 'docs/chroma/cs229_lectures/'\n","persist_directory_chinese = 
'docs/chroma/matplotlib/'"]},{"attachments":{},"cell_type":"markdown","id":"5ba63e21","metadata":{},"source":["将上节课所保存的向量数据库(`VectorDB`)加载进来"]},{"cell_type":"code","execution_count":3,"id":"a0189dc5","metadata":{"tags":[]},"outputs":[{"name":"stderr","output_type":"stream","text":["/Users/lta/anaconda3/envs/chat_data/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:119: LangChainDeprecationWarning: The class `OpenAIEmbeddings` was deprecated in LangChain 0.0.9 and will be removed in 0.3.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import OpenAIEmbeddings`.\n"," warn_deprecated(\n"]}],"source":["embedding = OpenAIEmbeddings(model='text-embedding-3-small')"]},{"cell_type":"code","execution_count":4,"id":"2be10170","metadata":{},"outputs":[],"source":["vectordb = Chroma(\n"," persist_directory=persist_directory,\n"," embedding_function=embedding\n",")"]},{"cell_type":"code","execution_count":5,"id":"3659e0f7","metadata":{"tags":[]},"outputs":[{"name":"stdout","output_type":"stream","text":["209\n"]}],"source":["print(vectordb._collection.count())"]},{"cell_type":"code","execution_count":6,"id":"a01ab000","metadata":{},"outputs":[],"source":["vectordb_chinese = Chroma(\n"," persist_directory=persist_directory_chinese,\n"," embedding_function=embedding\n",")"]},{"cell_type":"code","execution_count":7,"id":"a6998a03","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["27\n"]}],"source":["print(vectordb_chinese._collection.count())"]},{"attachments":{},"cell_type":"markdown","id":"9ae4fdd8","metadata":{},"source":["让我们现在来看看最大边际相关性的例子。因此,我们将从下面示例中加载有关蘑菇的信息。\n","\n","让我们现在运行它与MMR。让我们传入k等于2。我们仍然希望返回两个文档,但让我们设置获取k等于3,其中我们最初获取所有三个文档。我们现在可以看到,我们检索的文档中返回了有毒的信息。"]},{"cell_type":"code","execution_count":8,"id":"a807c758","metadata":{"tags":[]},"outputs":[],"source":["texts = [\n"," \"\"\"The Amanita 
phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).\"\"\",\n"," \"\"\"A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.\"\"\",\n"," \"\"\"A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.\"\"\",\n","]"]},{"cell_type":"code","execution_count":9,"id":"b110cceb","metadata":{},"outputs":[],"source":["texts_chinese = [\n"," \"\"\"毒鹅膏菌(Amanita phalloides)具有大型且引人注目的地上(epigeous)子实体(basidiocarp)\"\"\",\n"," \"\"\"一种具有大型子实体的蘑菇是毒鹅膏菌(Amanita phalloides)。某些品种全白。\"\"\",\n"," \"\"\"A. phalloides,又名死亡帽,是已知所有蘑菇中最有毒的一种。\"\"\",\n","]"]},{"attachments":{},"cell_type":"markdown","id":"84cd5f1c","metadata":{},"source":["对于这个例子,我们将创建一个小数据库,我们可以作为一个示例来使用。"]},{"cell_type":"code","execution_count":10,"id":"715d54f3","metadata":{"tags":[]},"outputs":[],"source":["smalldb = Chroma.from_texts(texts, embedding=embedding)"]},{"cell_type":"code","execution_count":11,"id":"305e1714","metadata":{},"outputs":[],"source":["smalldb_chinese = Chroma.from_texts(texts_chinese, embedding=embedding)"]},{"attachments":{},"cell_type":"markdown","id":"239a8d95","metadata":{},"source":["下面是我们对于这个示例所提出的问题"]},{"cell_type":"code","execution_count":12,"id":"9a37b5a5","metadata":{"tags":[]},"outputs":[],"source":["question = \"Tell me about all-white mushrooms with large fruiting bodies\""]},{"cell_type":"code","execution_count":13,"id":"92312e57","metadata":{},"outputs":[],"source":["question_chinese = \"告诉我关于具有大型子实体的全白色蘑菇的信息\""]},{"attachments":{},"cell_type":"markdown","id":"d3224a6d","metadata":{},"source":["现在,我们可以运行一个相似性搜索,设置k=2,只返回两个最相关的文档。\n","\n","我们可以看到,没有提到它是有毒的事实。"]},{"cell_type":"code","execution_count":14,"id":"24e3b025","metadata":{"tags":[]},"outputs":[{"data":{"text/plain":["[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. 
Some varieties are all-white.'),\n"," Document(page_content='The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).')]"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["smalldb.similarity_search(question, k=2)"]},{"cell_type":"code","execution_count":15,"id":"d4c5a47d","metadata":{},"outputs":[{"data":{"text/plain":["[Document(page_content='一种具有大型子实体的蘑菇是毒鹅膏菌(Amanita phalloides)。某些品种全白。'),\n"," Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.')]"]},"execution_count":15,"metadata":{},"output_type":"execute_result"}],"source":["smalldb_chinese.similarity_search(question_chinese, k=2)"]},{"attachments":{},"cell_type":"markdown","id":"bbb0ea94","metadata":{},"source":["现在,让我们运行最大边际相关性(MMR)。\n","\n","设置k=2,因为我们仍然希望返回两个文档。设置fetch_k=3,fetch_k是我们最初获取的所有文档(3个)。"]},{"cell_type":"code","execution_count":16,"id":"4daa6c0d","metadata":{"tags":[]},"outputs":[{"data":{"text/plain":["[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),\n"," Document(page_content='一种具有大型子实体的蘑菇是毒鹅膏菌(Amanita phalloides)。某些品种全白。')]"]},"execution_count":16,"metadata":{},"output_type":"execute_result"}],"source":["smalldb.max_marginal_relevance_search(question,k=2, fetch_k=3)"]},{"cell_type":"code","execution_count":17,"id":"e15521d2","metadata":{},"outputs":[{"data":{"text/plain":["[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. 
Some varieties are all-white.'),\n"," Document(page_content='一种具有大型子实体的蘑菇是毒鹅膏菌(Amanita phalloides)。某些品种全白。')]"]},"execution_count":17,"metadata":{},"output_type":"execute_result"}],"source":["smalldb_chinese.max_marginal_relevance_search(question,k=2, fetch_k=3)"]},{"attachments":{},"cell_type":"markdown","id":"e87c5f91","metadata":{},"source":["我们现在可以看到,我们检索的文档中返回了有毒的信息。"]},{"attachments":{},"cell_type":"markdown","id":"5a29e8c9","metadata":{},"source":["### 1.2 解决多样性:最大边际相关性(MMR)"]},{"attachments":{},"cell_type":"markdown","id":"a2b5c4ae","metadata":{},"source":["\n","我们刚刚通过一个示例引出了一个问题:如何加强搜索结果的多样性。\n"," \n","最大边际相关性(`Maximum marginal relevance`)试图在查询的相关性和结果的多样性之间实现两全其美。"]},{"attachments":{},"cell_type":"markdown","id":"2360545c","metadata":{},"source":["让我们回到上一节课的一个例子,当我们通过问题对向量数据库进行相似性搜索后\n","\n","我们可以看看前两个文档,只看前几个字符,可以看到它们是相同的。"]},{"cell_type":"code","execution_count":18,"id":"9bb2c0a9","metadata":{"tags":[]},"outputs":[],"source":["question = \"what did they say about matlab?\"\n","docs_ss = vectordb.similarity_search(question,k=3)"]},{"cell_type":"code","execution_count":19,"id":"f07f8793","metadata":{"tags":[]},"outputs":[{"data":{"text/plain":["'those homeworks will be done in either MATLA B or in Octave, which is sort of — I \\nknow some people '"]},"execution_count":19,"metadata":{},"output_type":"execute_result"}],"source":["docs_ss[0].page_content[:100]"]},{"cell_type":"code","execution_count":20,"id":"e9f7e165","metadata":{"tags":[]},"outputs":[{"data":{"text/plain":["'those homeworks will be done in either MATLA B or in Octave, which is sort of — I \\nknow some people '"]},"execution_count":20,"metadata":{},"output_type":"execute_result"}],"source":["docs_ss[1].page_content[:100]"]},{"cell_type":"code","execution_count":21,"id":"e8e142eb","metadata":{},"outputs":[],"source":["question_chinese = \"Matplotlib是什么?\"\n","docs_ss_chinese = 
vectordb_chinese.similarity_search(question_chinese,k=3)"]},{"cell_type":"code","execution_count":22,"id":"cf642f66","metadata":{},"outputs":[{"data":{"text/plain":["'第⼀回:Matplotlib 初相识\\n⼀、认识matplotlib\\nMatplotlib 是⼀个 Python 2D 绘图库,能够以多种硬拷⻉格式和跨平台的交互式环境⽣成出版物质量的图形,⽤来绘制各种'"]},"execution_count":22,"metadata":{},"output_type":"execute_result"}],"source":["docs_ss_chinese[0].page_content[:100]"]},{"cell_type":"code","execution_count":23,"id":"1e9f5cfe","metadata":{},"outputs":[{"data":{"text/plain":["'第⼀回:Matplotlib 初相识\\n⼀、认识matplotlib\\nMatplotlib 是⼀个 Python 2D 绘图库,能够以多种硬拷⻉格式和跨平台的交互式环境⽣成出版物质量的图形,⽤来绘制各种'"]},"execution_count":23,"metadata":{},"output_type":"execute_result"}],"source":["docs_ss_chinese[1].page_content[:100]"]},{"attachments":{},"cell_type":"markdown","id":"4c4ca1b6","metadata":{},"source":["注意:使用`MMR`所得出结果的差异。"]},{"cell_type":"code","execution_count":24,"id":"222234c5","metadata":{"tags":[]},"outputs":[],"source":["docs_mmr = vectordb.max_marginal_relevance_search(question,k=3)"]},{"cell_type":"code","execution_count":25,"id":"408935bc","metadata":{},"outputs":[],"source":["docs_mmr_chinese = vectordb_chinese.max_marginal_relevance_search(question_chinese,k=3)"]},{"attachments":{},"cell_type":"markdown","id":"9076db81","metadata":{},"source":["当我们运行MMR后得到结果时,我们可以看到第一个与之前的相同,因为那是最相似的。"]},{"cell_type":"code","execution_count":26,"id":"93b20226","metadata":{"tags":[]},"outputs":[{"data":{"text/plain":["'those homeworks will be done in either MATLA B or in Octave, which is sort of — I \\nknow some people '"]},"execution_count":26,"metadata":{},"output_type":"execute_result"}],"source":["docs_mmr[0].page_content[:100]"]},{"cell_type":"code","execution_count":27,"id":"d0acfaab","metadata":{},"outputs":[{"data":{"text/plain":["'第⼀回:Matplotlib 初相识\\n⼀、认识matplotlib\\nMatplotlib 是⼀个 Python 2D 
绘图库,能够以多种硬拷⻉格式和跨平台的交互式环境⽣成出版物质量的图形,⽤来绘制各种'"]},"execution_count":27,"metadata":{},"output_type":"execute_result"}],"source":["docs_mmr_chinese[0].page_content[:100]"]},{"attachments":{},"cell_type":"markdown","id":"7a93743f","metadata":{},"source":["但是当我们进行到第二个时,我们可以看到它是不同的。\n","\n","它在回应中获得了一些多样性。"]},{"cell_type":"code","execution_count":28,"id":"17d39762","metadata":{"tags":[]},"outputs":[{"data":{"text/plain":["\"mathematical work, he feels like he's disc overing truth and beauty in the universe. And \\nhe says it\""]},"execution_count":28,"metadata":{},"output_type":"execute_result"}],"source":["docs_mmr[1].page_content[:100]"]},{"cell_type":"code","execution_count":29,"id":"93d3206c","metadata":{},"outputs":[{"data":{"text/plain":["'fig, ax = plt.subplots(figsize=(4,3))\\nax.grid(True)\\n使⽤ set_xscale 可以设置坐标轴的规度(指对数坐标等)\\nfig, axs = plt.'"]},"execution_count":29,"metadata":{},"output_type":"execute_result"}],"source":["docs_mmr_chinese[1].page_content[:100]"]},{"attachments":{},"cell_type":"markdown","id":"b2b909bc","metadata":{},"source":["### 1.3 解决特殊性:使用元数据"]},{"attachments":{},"cell_type":"markdown","id":"7b63c5ee","metadata":{},"source":["\n","在上一节课中,我们展示了一个问题,是询问了关于文档中某一讲的问题,但得到的结果中也包括了来自其他讲的结果。\n","\n","为了解决这一问题,很多向量数据库都支持对`metadata`的操作。\n","\n","`metadata`为每个嵌入的块(embedded chunk)提供上下文。"]},{"cell_type":"code","execution_count":30,"id":"3c1a60b2","metadata":{"tags":[]},"outputs":[],"source":["question = \"what did they say about regression in the third lecture?\""]},{"cell_type":"code","execution_count":31,"id":"ba98df3c","metadata":{},"outputs":[],"source":["question_chinese = \"他们在第二讲中对Figure说了些什么?\" "]},{"attachments":{},"cell_type":"markdown","id":"3873525e","metadata":{},"source":["现在,我们以手动的方式来解决这个问题,我们会指定一个元数据过滤器`filter`"]},{"cell_type":"code","execution_count":32,"id":"a8612840","metadata":{"tags":[]},"outputs":[],"source":["docs = vectordb.similarity_search(\n"," question,\n"," k=3,\n"," 
filter={\"source\":\"docs/cs229_lectures/MachineLearning-Lecture03.pdf\"}\n",")"]},{"cell_type":"code","execution_count":33,"id":"b46c7e76","metadata":{},"outputs":[],"source":["docs_chinese = vectordb_chinese.similarity_search(\n"," question_chinese,\n"," k=3,\n"," filter={\"source\":\"docs/matplotlib/第二回:艺术画笔见乾坤.pdf\"}\n",")"]},{"attachments":{},"cell_type":"markdown","id":"869aee28","metadata":{},"source":["接下来,我们可以看到结果都来自对应的章节"]},{"cell_type":"code","execution_count":34,"id":"97031876","metadata":{"tags":[]},"outputs":[{"name":"stdout","output_type":"stream","text":["{'page': 0, 'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf'}\n","{'page': 14, 'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf'}\n","{'page': 6, 'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf'}\n"]}],"source":["for d in docs:\n"," print(d.metadata)"]},{"cell_type":"code","execution_count":35,"id":"2708f6ae","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["{'page': 9, 'source': 'docs/matplotlib/第二回:艺术画笔见乾坤.pdf'}\n","{'page': 0, 'source': 'docs/matplotlib/第二回:艺术画笔见乾坤.pdf'}\n","{'page': 3, 'source': 'docs/matplotlib/第二回:艺术画笔见乾坤.pdf'}\n"]}],"source":["for d in docs_chinese:\n"," print(d.metadata)\n"," "]},{"attachments":{},"cell_type":"markdown","id":"5e299f8e","metadata":{},"source":["当然,我们不能每次都采用手动的方式来解决这个问题,这会显得不够智能\n","\n","下一小节将要展示通过LLM来解决这个问题"]},{"attachments":{},"cell_type":"markdown","id":"ccc2d784","metadata":{},"source":["### 1.4 解决特殊性:在元数据中使用自查询检索器"]},{"attachments":{},"cell_type":"markdown","id":"82ef44b6","metadata":{},"source":["我们有一个有趣的挑战:我们通常希望从查询本身来推断元数据。\n","\n","为了解决这个问题,我们可以使用SelfQueryRetriever,它使用LLM来提取:\n"," \n","1. 用于向量搜索的查询(`query`)字符串,即:问题\n","2. 
要一起传入的元数据过滤器\n","\n","大多数向量数据库支持元数据过滤器,因此不需要任何新的数据库及索引。"]},{"cell_type":"code","execution_count":36,"id":"b1d06084","metadata":{"tags":[]},"outputs":[],"source":["from langchain_community.llms.openai import OpenAI\n","from langchain.retrievers.self_query.base import SelfQueryRetriever\n","from langchain.chains.query_constructor.base import AttributeInfo"]},{"cell_type":"code","execution_count":37,"id":"869c27c0","metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["/Users/lta/anaconda3/envs/chat_data/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:119: LangChainDeprecationWarning: The class `OpenAI` was deprecated in LangChain 0.0.10 and will be removed in 0.3.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import OpenAI`.\n"," warn_deprecated(\n"]}],"source":["llm = OpenAI(temperature=0)"]},{"attachments":{},"cell_type":"markdown","id":"acd194c5","metadata":{},"source":["`AttributeInfo`用于指定元数据中的各个字段以及它们各自的含义。\n","\n","在元数据中,我们只有两个字段,`source`和`page`。\n","\n","我们将填写每个属性的名称、描述和类型。\n","\n","这些信息实际上将被传递给LLM,所以需要尽可能详细地描述。"]},{"cell_type":"code","execution_count":38,"id":"0aa5e698","metadata":{"tags":[]},"outputs":[],"source":["metadata_field_info = [\n"," AttributeInfo(\n"," name=\"source\",\n"," description=\"The lecture the chunk is from, should be one of `docs/cs229_lectures/MachineLearning-Lecture01.pdf`, `docs/cs229_lectures/MachineLearning-Lecture02.pdf`, or `docs/cs229_lectures/MachineLearning-Lecture03.pdf`\",\n"," type=\"string\",\n"," ),\n"," AttributeInfo(\n"," name=\"page\",\n"," description=\"The page from the lecture\",\n"," type=\"integer\",\n"," ),\n","]"]},{"cell_type":"code","execution_count":39,"id":"544ad7c1","metadata":{},"outputs":[],"source":["metadata_field_info_chinese = [\n"," AttributeInfo(\n"," name=\"source\",\n"," description=\"讲义来源于 
`docs/matplotlib/第一回:Matplotlib初相识.pdf`, `docs/matplotlib/第二回:艺术画笔见乾坤.pdf`, or `docs/matplotlib/第三回:布局格式定方圆.pdf` 的其中之一\",\n"," type=\"string\",\n"," ),\n"," AttributeInfo(\n"," name=\"page\",\n"," description=\"讲义的那一页\",\n"," type=\"integer\",\n"," ),\n","]"]},{"cell_type":"code","execution_count":40,"id":"e7906c15","metadata":{"tags":[]},"outputs":[],"source":["document_content_description = \"Lecture notes\"\n","retriever = SelfQueryRetriever.from_llm(\n"," llm,\n"," vectordb,\n"," document_content_description,\n"," metadata_field_info,\n"," verbose=True\n",")"]},{"cell_type":"code","execution_count":41,"id":"d5b99571","metadata":{},"outputs":[],"source":["document_content_description_chinese = \"课堂讲义\"\n","retriever_chinese = SelfQueryRetriever.from_llm(\n"," llm,\n"," vectordb_chinese,\n"," document_content_description_chinese,\n"," metadata_field_info_chinese,\n"," verbose=True\n",")"]},{"cell_type":"code","execution_count":42,"id":"79d781b9","metadata":{"tags":[]},"outputs":[],"source":["question = \"what did they say about regression in the third lecture?\""]},{"cell_type":"code","execution_count":43,"id":"8d9b7e18","metadata":{},"outputs":[],"source":["question_chinese = \"他们在第二讲中对Figure说了些什么?\" "]},{"cell_type":"code","execution_count":44,"id":"1d4f9f7d","metadata":{"tags":[]},"outputs":[{"name":"stderr","output_type":"stream","text":["/Users/lta/anaconda3/envs/chat_data/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:119: LangChainDeprecationWarning: The method `BaseRetriever.get_relevant_documents` was deprecated in langchain-core 0.1.46 and will be removed in 0.3.0. 
Use invoke instead.\n"," warn_deprecated(\n"]}],"source":["docs = retriever.get_relevant_documents(question)"]},{"cell_type":"code","execution_count":45,"id":"ea39a97e","metadata":{},"outputs":[],"source":["docs_chinese = retriever_chinese.get_relevant_documents(question_chinese)"]},{"cell_type":"code","execution_count":46,"id":"db04374e","metadata":{"tags":[]},"outputs":[{"name":"stdout","output_type":"stream","text":["{'page': 10, 'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf'}\n","{'page': 11, 'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf'}\n","{'page': 6, 'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf'}\n","{'page': 2, 'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf'}\n"]}],"source":["for d in docs:\n"," print(d.metadata)"]},{"cell_type":"code","execution_count":47,"id":"143061f5","metadata":{},"outputs":[],"source":["for d in docs_chinese:\n"," print(d.metadata)"]},{"attachments":{},"cell_type":"markdown","id":"297b8168","metadata":{},"source":["### 1.5 其他技巧:压缩"]},{"attachments":{},"cell_type":"markdown","id":"564144da","metadata":{},"source":["另一种提高检索到的文档质量的方法是压缩。\n","\n","与查询最相关的信息可能隐藏在具有大量不相关文本的文档中。\n","\n","在应用程序中传递完整的文档可能会导致更昂贵的LLM调用和更差的响应。\n","\n","上下文压缩就是为了解决这个问题。"]},{"cell_type":"code","execution_count":48,"id":"a060cf74","metadata":{"tags":[]},"outputs":[],"source":["from langchain.retrievers import ContextualCompressionRetriever\n","from langchain.retrievers.document_compressors import LLMChainExtractor"]},{"cell_type":"code","execution_count":49,"id":"038649c8","metadata":{"tags":[]},"outputs":[],"source":["def pretty_print_docs(docs):\n"," print(f\"\\n{'-' * 100}\\n\".join([f\"Document {i+1}:\\n\\n\" + d.page_content for i, d in enumerate(docs)]))"]},{"cell_type":"code","execution_count":50,"id":"fc686cf2","metadata":{"tags":[]},"outputs":[],"source":["llm = OpenAI(temperature=0)\n","compressor = LLMChainExtractor.from_llm(llm) # 
压缩器"]},{"cell_type":"code","execution_count":51,"id":"82794397","metadata":{"tags":[]},"outputs":[],"source":["compression_retriever = ContextualCompressionRetriever(\n"," base_compressor=compressor,\n"," base_retriever=vectordb.as_retriever()\n",")"]},{"cell_type":"code","execution_count":52,"id":"915598f8","metadata":{},"outputs":[],"source":["compression_retriever_chinese = ContextualCompressionRetriever(\n"," base_compressor=compressor,\n"," base_retriever=vectordb_chinese.as_retriever()\n",")"]},{"cell_type":"code","execution_count":53,"id":"cde86848","metadata":{"tags":[]},"outputs":[{"name":"stdout","output_type":"stream","text":["Document 1:\n","\n","- \"those homeworks will be done in either MATLA B or in Octave\"\n","- \"I know some people call it a free ve rsion of MATLAB\"\n","- \"MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data.\"\n","- \"there's also a software package called Octave that you can download for free off the Internet.\"\n","- \"it has somewhat fewer features than MATLAB, but it's free, and for the purposes of this class, it will work for just about everything.\"\n","- \"once a colleague of mine at a different university, not at Stanford, actually teaches another machine learning course.\"\n","----------------------------------------------------------------------------------------------------\n","Document 2:\n","\n","- \"those homeworks will be done in either MATLA B or in Octave\"\n","- \"I know some people call it a free ve rsion of MATLAB\"\n","- \"MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data.\"\n","- \"there's also a software package called Octave that you can download for free off the Internet.\"\n","- \"it has somewhat fewer features than MATLAB, but it's free, and for the 
purposes of this class, it will work for just about everything.\"\n","- \"once a colleague of mine at a different university, not at Stanford, actually teaches another machine learning course.\"\n","----------------------------------------------------------------------------------------------------\n","Document 3:\n","\n","\"Oh, it was the MATLAB.\"\n","----------------------------------------------------------------------------------------------------\n","Document 4:\n","\n","\"Oh, it was the MATLAB.\"\n"]}],"source":["question = \"what did they say about matlab?\"\n","compressed_docs = compression_retriever.get_relevant_documents(question)\n","pretty_print_docs(compressed_docs)"]},{"cell_type":"code","execution_count":54,"id":"39726b24","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Document 1:\n","\n","Matplotlib 是⼀个 Python 2D 绘图库,能够以多种硬拷⻉格式和跨平台的交互式环境⽣成出版物质量的图形,⽤来绘制各种静态,动态,交互式的图表。\n","----------------------------------------------------------------------------------------------------\n","Document 2:\n","\n","Matplotlib 是⼀个 Python 2D 绘图库,能够以多种硬拷⻉格式和跨平台的交互式环境⽣成出版物质量的图形,⽤来绘制各种静态,动态,交互式的图表。\n","----------------------------------------------------------------------------------------------------\n","Document 3:\n","\n","Axes:matplotlib 宇宙的核⼼,容纳了⼤量元素⽤来构造⼀幅幅⼦图,⼀个 figure 可以由⼀个或多个⼦图组成\n","Axis:axes的下属层级,⽤于处理所有和坐标轴,⽹格有关的元素\n","Tick:axis的下属层级,⽤来处理所有和刻度有关的元素\n","四、两种绘图接⼝\n","matplotlib 提供了两种最常⽤的绘图接⼝\n","----------------------------------------------------------------------------------------------------\n","Document 4:\n","\n","Axes:matplotlib 宇宙的核⼼,容纳了⼤量元素⽤来构造⼀幅幅⼦图,⼀个 figure 可以由⼀个或多个⼦图组成\n","Axis:axes的下属层级,⽤于处理所有和坐标轴,⽹格有关的元素\n","Tick:axis的下属层级,⽤来处理所有和刻度有关的元素\n","四、两种绘图接⼝\n","matplotlib 提供了两种最常⽤的绘图接⼝\n"]}],"source":["question_chinese = \"Matplotlib是什么?\"\n","compressed_docs_chinese = 
compression_retriever_chinese.get_relevant_documents(question_chinese)\n","pretty_print_docs(compressed_docs_chinese)"]},{"attachments":{},"cell_type":"markdown","id":"049b2601","metadata":{},"source":["现在当我们提出问题后,查看结果文档\n","\n","我们可以看到两件事。\n","\n","1. 它们比正常文档短很多\n","2. 仍然有一些重复的东西,这是因为在底层我们使用的是语义搜索算法。\n","\n","这就是我们在本课程前面使用MMR解决的问题。\n","\n","这是一个很好的例子,你可以结合各种技术得到最好的可能结果。"]},{"attachments":{},"cell_type":"markdown","id":"82c4fc4d","metadata":{},"source":["## 二、结合各种技术"]},{"attachments":{},"cell_type":"markdown","id":"54432975","metadata":{},"source":["为了做到这一点,我们在从向量数据库创建检索器时,可以将搜索类型设置为MMR。\n","\n","然后我们可以重新运行这个过程,看到我们返回的是一个过滤过的结果集,其中不包含任何重复的信息。"]},{"cell_type":"code","execution_count":55,"id":"161ae1ad","metadata":{"tags":[]},"outputs":[],"source":["compression_retriever = ContextualCompressionRetriever(\n"," base_compressor=compressor,\n"," base_retriever=vectordb.as_retriever(search_type = \"mmr\")\n",")"]},{"cell_type":"code","execution_count":56,"id":"cd6396bb","metadata":{},"outputs":[],"source":["compression_retriever_chinese = ContextualCompressionRetriever(\n"," base_compressor=compressor,\n"," base_retriever=vectordb_chinese.as_retriever(search_type = \"mmr\")\n",")"]},{"cell_type":"code","execution_count":57,"id":"e77ccae1","metadata":{"tags":[]},"outputs":[{"name":"stdout","output_type":"stream","text":["Document 1:\n","\n","- \"those homeworks will be done in either MATLA B or in Octave\"\n","- \"I know some people call it a free ve rsion of MATLAB\"\n","- \"MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data.\"\n","- \"there's also a software package called Octave that you can download for free off the Internet.\"\n","- \"it has somewhat fewer features than MATLAB, but it's free, and for the purposes of this class, it will work for just about everything.\"\n","- \"once a colleague of mine at a different university, not at 
Stanford, actually teaches another machine learning course.\"\n","----------------------------------------------------------------------------------------------------\n","Document 2:\n","\n","\"Oh, it was the MATLAB.\"\n"]}],"source":["question = \"what did they say about matlab?\"\n","compressed_docs = compression_retriever.get_relevant_documents(question)\n","pretty_print_docs(compressed_docs)"]},{"cell_type":"code","execution_count":58,"id":"fe68a14b","metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["Document 1:\n","\n","Matplotlib 是⼀个 Python 2D 绘图库,能够以多种硬拷⻉格式和跨平台的交互式环境⽣成出版物质量的图形,⽤来绘制各种静态,动态,交互式的图表。\n","----------------------------------------------------------------------------------------------------\n","Document 2:\n","\n","Matplotlib是什么?\n","fig, ax = plt.subplots(figsize=(4,3))\n","ax.grid(True)\n","使⽤ set_xscale 可以设置坐标轴的规度(指对数坐标等)\n","fig, axs = plt.subplots(1, 2, figsize=(10, 4))\n","for j in range(2):\n"," axs[j].plot(list('abcd'), [10**i for i in range(4)])\n"," if j==0:\n"," axs[j].set_yscale('log')\n"," else:\n"," pass\n","fig.tight_layout()\n","----------------------------------------------------------------------------------------------------\n","Document 3:\n","\n","matplotlib.pyplot.pie(x, explode=None, labels=None, colors=None, autopct=None, pctdistance=0.6, shadow=False,\n","labeldistance=1.1, startangle=0, radius=1, counterclock=True, wedgeprops=None, textprops=None, center=0, 0,\n","frame=False, rotatelabels=False, *, normalize=None, data=None)\n","制作数据x 的饼图,每个楔⼦的⾯积⽤ x/sum(x) 表示。\n","其中最主要的参数是前 4 个:\n","x:楔型的形状,⼀维数组。\n","explode:如果不是等于 None ,则是⼀个 len(x) 数组,它指定⽤于偏移每个楔形块的半径的分数。\n","labels:⽤于指定每个楔型块的标记,取值是列表或为 None 。\n","colors:饼图循环使⽤的颜⾊序列。如果取值为 None ,将使⽤当前\n","----------------------------------------------------------------------------------------------------\n","Document 4:\n","\n","Matplotlib是什么?\n"]}],"source":["question_chinese = \"Matplotlib是什么?\"\n","compressed_docs_chinese = 
compression_retriever_chinese.get_relevant_documents(question_chinese)\n","pretty_print_docs(compressed_docs_chinese)"]},{"attachments":{},"cell_type":"markdown","id":"6c2b63e1","metadata":{},"source":["## 三、其他类型的检索"]},{"attachments":{},"cell_type":"markdown","id":"3e777a7b","metadata":{},"source":["值得注意的是,vectordb并不是唯一一种检索文档的工具。\n","\n","`LangChain`检索器抽象包括其他检索文档的方式,如:`TF-IDF` 或 `SVM`。"]},{"cell_type":"code","execution_count":59,"id":"83d2e808","metadata":{"tags":[]},"outputs":[],"source":["from langchain.retrievers import SVMRetriever\n","from langchain.retrievers import TFIDFRetriever\n","from langchain.document_loaders import PyPDFLoader\n","from langchain.text_splitter import RecursiveCharacterTextSplitter"]},{"cell_type":"code","execution_count":60,"id":"bcf5b760","metadata":{"tags":[]},"outputs":[],"source":["# 加载PDF\n","loader = PyPDFLoader(\"docs/cs229_lectures/MachineLearning-Lecture01.pdf\")\n","pages = loader.load()\n","all_page_text = [p.page_content for p in pages]\n","joined_page_text = \" \".join(all_page_text)\n","\n","# 分割文本\n","text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1500,chunk_overlap = 150)\n","splits = text_splitter.split_text(joined_page_text)\n"]},{"cell_type":"code","execution_count":61,"id":"1657e768","metadata":{},"outputs":[],"source":["# 加载PDF\n","loader_chinese = PyPDFLoader(\"docs/matplotlib/第一回:Matplotlib初相识.pdf\")\n","pages_chinese = loader_chinese.load()\n","all_page_text_chinese = [p.page_content for p in pages_chinese]\n","joined_page_text_chinese = \" \".join(all_page_text_chinese)\n","\n","# 分割文本\n","text_splitter_chinese = RecursiveCharacterTextSplitter(chunk_size = 1500,chunk_overlap = 150)\n","splits_chinese = text_splitter_chinese.split_text(joined_page_text_chinese)"]},{"cell_type":"code","execution_count":62,"id":"59abbaff","metadata":{},"outputs":[],"source":["# 检索\n","svm_retriever = SVMRetriever.from_texts(splits, embedding)\n","tfidf_retriever = 
TFIDFRetriever.from_texts(splits)"]},{"cell_type":"code","execution_count":63,"id":"7885389e","metadata":{},"outputs":[{"data":{"text/plain":["Document(page_content=\"or find project partners or discuss homework problems and so on, and it's not monitored \\nby the TAs and me. So feel free to ta lk trash about this class there. \\nIf you want to contact the teaching staff, pl ease use the email address written down here, \\ncs229-qa@cs.stanford.edu. This goes to an acc ount that's read by all the TAs and me. So \\nrather than sending us email individually, if you send email to this account, it will \\nactually let us get back to you maximally quickly with answers to your questions. \\nIf you're asking questions about homework probl ems, please say in the subject line which \\nassignment and which question the email refers to, since that will also help us to route \\nyour question to the appropriate TA or to me appropriately and get the response back to \\nyou quickly. \\nLet's see. Skipping ahead — let's see — for homework, one midterm, one open and term \\nproject. Notice on the honor code. So one thi ng that I think will help you to succeed and \\ndo well in this class and even help you to enjoy this cla ss more is if you form a study \\ngroup. \\nSo start looking around where you' re sitting now or at the end of class today, mingle a \\nlittle bit and get to know your classmates. I strongly encourage you to form study groups \\nand sort of have a group of people to study with and have a group of your fellow students \\nto talk over these concepts with. 
You can also post on the class news group if you want to \\nuse that to try to form a study group.\")"]},"execution_count":63,"metadata":{},"output_type":"execute_result"}],"source":["question = \"What are major topics for this class?\" # 这门课的主要主题是什么?\n","docs_svm = svm_retriever.get_relevant_documents(question)\n","docs_svm[0]"]},{"cell_type":"code","execution_count":64,"id":"2a1659c0","metadata":{"tags":[]},"outputs":[{"data":{"text/plain":["Document(page_content=\"Saxena and Min Sun here did, wh ich is given an image like this, right? This is actually a \\npicture taken of the Stanford campus. You can apply that sort of cl ustering algorithm and \\ngroup the picture into regions. Let me actually blow that up so that you can see it more \\nclearly. Okay. So in the middle, you see the lines sort of groupi ng the image together, \\ngrouping the image into [inaudible] regions. \\nAnd what Ashutosh and Min did was they then applied the learning algorithm to say can \\nwe take this clustering and us e it to build a 3D model of the world? And so using the \\nclustering, they then had a lear ning algorithm try to learn what the 3D structure of the \\nworld looks like so that they could come up with a 3D model that you can sort of fly \\nthrough, okay? Although many people used to th ink it's not possible to take a single \\nimage and build a 3D model, but using a lear ning algorithm and that sort of clustering \\nalgorithm is the first step. They were able to. \\nI'll just show you one more example. I like this because it's a picture of Stanford with our \\nbeautiful Stanford campus. So again, taking th e same sort of clustering algorithms, taking \\nthe same sort of unsupervised learning algor ithm, you can group the pixels into different \\nregions. And using that as a pre-processing step, they eventually built this sort of 3D model of Stanford campus in a single picture. 
You can sort of walk into the ceiling, look\")"]},"execution_count":64,"metadata":{},"output_type":"execute_result"}],"source":["question = \"what did they say about matlab?\" # 他们关于Matlab说了些什么?\n","docs_tfidf = tfidf_retriever.get_relevant_documents(question)\n","docs_tfidf[0]"]},{"cell_type":"code","execution_count":65,"id":"5ade9c87","metadata":{},"outputs":[],"source":["# 中文版\n","svm_retriever_chinese = SVMRetriever.from_texts(splits_chinese, embedding)\n","tfidf_retriever_chinese = TFIDFRetriever.from_texts(splits_chinese)"]},{"cell_type":"code","execution_count":66,"id":"cc823bea","metadata":{},"outputs":[{"data":{"text/plain":["Document(page_content='第⼀回:Matplotlib 初相识\\n⼀、认识matplotlib\\nMatplotlib 是⼀个 Python 2D 绘图库,能够以多种硬拷⻉格式和跨平台的交互式环境⽣成出版物质量的图形,⽤来绘制各种静态,动态,\\n交互式的图表。\\nMatplotlib 可⽤于 Python 脚本, Python 和 IPython Shell 、 Jupyter notebook , Web 应⽤程序服务器和各种图形⽤户界⾯⼯具包等。\\nMatplotlib 是 Python 数据可视化库中的泰⽃,它已经成为 python 中公认的数据可视化⼯具,我们所熟知的 pandas 和 seaborn 的绘图接⼝\\n其实也是基于 matplotlib 所作的⾼级封装。\\n为了对matplotlib 有更好的理解,让我们从⼀些最基本的概念开始认识它,再逐渐过渡到⼀些⾼级技巧中。\\n⼆、⼀个最简单的绘图例⼦\\nMatplotlib 的图像是画在 figure (如 windows , jupyter 窗体)上的,每⼀个 figure ⼜包含了⼀个或多个 axes (⼀个可以指定坐标系的⼦区\\n域)。最简单的创建 figure 以及 axes 的⽅式是通过 pyplot.subplots命令,创建 axes 以后,可以使⽤ Axes.plot绘制最简易的折线图。\\nimport matplotlib.pyplot as plt\\nimport matplotlib as mpl\\nimport numpy as np\\nfig, ax = plt.subplots() # 创建⼀个包含⼀个 axes 的 figure\\nax.plot([1, 2, 3, 4], [1, 4, 2, 3]); # 绘制图像\\nTrick: 在jupyter notebook 中使⽤ matplotlib 时会发现,代码运⾏后⾃动打印出类似 \\n这样⼀段话,这是因为 matplotlib 的绘图代码默认打印出最后⼀个对象。如果不想显示这句话,有以下三种⽅法,在本章节的代码示例\\n中你能找到这三种⽅法的使⽤。\\n\\x00. 在代码块最后加⼀个分号 ;\\n\\x00. 在代码块最后加⼀句 plt.show()\\n\\x00. 
在绘图时将绘图对象显式赋值给⼀个变量,如将 plt.plot([1, 2, 3, 4]) 改成 line =plt.plot([1, 2, 3, 4])\\n和MATLAB 命令类似,你还可以通过⼀种更简单的⽅式绘制图像, matplotlib.pyplot⽅法能够直接在当前 axes 上绘制图像,如果⽤户\\n未指定axes , matplotlib 会帮你⾃动创建⼀个。所以上⾯的例⼦也可以简化为以下这⼀⾏代码。\\nline =plt.plot([1, 2, 3, 4], [1, 4, 2, 3]) \\n三、Figure 的组成\\n现在我们来深⼊看⼀下 figure 的组成。通过⼀张 figure 解剖图,我们可以看到⼀个完整的 matplotlib 图像通常会包括以下四个层级,这些\\n层级也被称为容器( container ),下⼀节会详细介绍。在 matplotlib 的世界中,我们将通过各种命令⽅法来操纵图像中的每⼀个部分,\\n从⽽达到数据可视化的最终效果,⼀副完整的图像实际上是各类⼦元素的集合。\\nFigure:顶层级,⽤来容纳所有绘图元素')"]},"execution_count":66,"metadata":{},"output_type":"execute_result"}],"source":["question_chinese = \"这门课的主要主题是什么?\" \n","docs_svm_chinese = svm_retriever_chinese.get_relevant_documents(question_chinese)\n","docs_svm_chinese[0]"]},{"cell_type":"code","execution_count":67,"id":"01eb9d43","metadata":{},"outputs":[{"data":{"text/plain":["Document(page_content='fig, ax = plt.subplots() \\n# step4 绘制图像, 这⼀模块的扩展参考第⼆章进⼀步学习\\nax.plot(x, y, label=\\'linear\\') \\n# step5 添加标签,⽂字和图例,这⼀模块的扩展参考第四章进⼀步学习\\nax.set_xlabel(\\'x label\\') \\nax.set_ylabel(\\'y label\\') \\nax.set_title(\"Simple Plot\") \\nax.legend() ;\\n思考题\\n请思考两种绘图模式的优缺点和各⾃适合的使⽤场景\\n在第五节绘图模板中我们是以 OO 模式作为例⼦展示的,请思考并写⼀个 pyplot 绘图模式的简单模板')"]},"execution_count":67,"metadata":{},"output_type":"execute_result"}],"source":["question_chinese = \"Matplotlib是什么?\"\n","docs_tfidf_chinese = tfidf_retriever_chinese.get_relevant_documents(question_chinese)\n","docs_tfidf_chinese[0]"]}],"metadata":{"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.9"}},"nbformat":4,"nbformat_minor":5}
+{"cells": [{"attachments": {}, "cell_type": "markdown", "id": "0689733d", "metadata": {}, "source": ["# \u7b2c\u4e94\u7ae0 \u68c0\u7d22(Retrieval)", "\n", " - [\u4e00\u3001\u5411\u91cf\u6570\u636e\u5e93\u68c0\u7d22](#\u4e00\u3001\u5411\u91cf\u6570\u636e\u5e93\u68c0\u7d22)\n", " - [1.1 \u76f8\u4f3c\u6027\u641c\u7d22](#1.1-\u76f8\u4f3c\u6027\u641c\u7d22)\n", " - [1.2 \u89e3\u51b3\u591a\u6837\u6027\uff1a\u6700\u5927\u8fb9\u9645\u76f8\u5173\u6027(MMR)](#1.2-\u89e3\u51b3\u591a\u6837\u6027\uff1a\u6700\u5927\u8fb9\u9645\u76f8\u5173\u6027(MMR))\n", " - [1.3 \u89e3\u51b3\u7279\u6b8a\u6027\uff1a\u4f7f\u7528\u5143\u6570\u636e](#1.3-\u89e3\u51b3\u7279\u6b8a\u6027\uff1a\u4f7f\u7528\u5143\u6570\u636e)\n", " - [1.4 \u89e3\u51b3\u7279\u6b8a\u6027\uff1a\u5728\u5143\u6570\u636e\u4e2d\u4f7f\u7528\u81ea\u67e5\u8be2\u68c0\u7d22\u5668](#1.4-\u89e3\u51b3\u7279\u6b8a\u6027\uff1a\u5728\u5143\u6570\u636e\u4e2d\u4f7f\u7528\u81ea\u67e5\u8be2\u68c0\u7d22\u5668)\n", " - [1.5 \u5176\u4ed6\u6280\u5de7\uff1a\u538b\u7f29](#1.5-\u5176\u4ed6\u6280\u5de7\uff1a\u538b\u7f29)\n", " - [\u4e8c\u3001\u7ed3\u5408\u5404\u79cd\u6280\u672f](#\u4e8c\u3001\u7ed3\u5408\u5404\u79cd\u6280\u672f)\n", " - [\u4e09\u3001\u5176\u4ed6\u7c7b\u578b\u7684\u68c0\u7d22](#\u4e09\u3001\u5176\u4ed6\u7c7b\u578b\u7684\u68c0\u7d22)\n"]}, {"attachments": {}, "cell_type": "markdown", "id": "d12fbd74", "metadata": {}, "source": ["\n", "\u68c0\u7d22\u662f\u6211\u4eec\u7684\u68c0\u7d22\u589e\u5f3a\u751f\u6210(RAG)\u6d41\u7a0b\u7684\u6838\u5fc3\u3002\n", "\n", "\u8ba9\u6211\u4eec\u83b7\u5f97\u524d\u9762\u8bfe\u7a0b\u5b58\u50a8\u7684\u5411\u91cf\u6570\u636e\u5e93(`VectorDB`)\u3002"]}, {"attachments": {}, "cell_type": "markdown", "id": "ed2569c6", "metadata": {}, "source": ["## \u4e00\u3001\u5411\u91cf\u6570\u636e\u5e93\u68c0\u7d22"]}, {"attachments": {}, "cell_type": "markdown", "id": "651a10db", "metadata": {}, "source": ["\u5728\u5f53\u524d\u6587\u4ef6\u5939\u4e0b\u65b0\u5efa`.env`\u6587\u4ef6\uff0c\u5185\u5bb9\u4e3a`OPENAI_API_KEY = 
\"sk-...\"`\n", "\n", "\u672c\u7ae0\u8282\u9700\u8981\u4f7f\u7528`lark`\u5305\uff0c\u8fd0\u884c\u4ee5\u4e0b\u547d\u4ee4\u5b89\u88c5"]}, {"cell_type": "code", "execution_count": 1, "id": "c18f8a7b-62af-403e-9877-18d1c2265b4f", "metadata": {"tags": []}, "outputs": [], "source": ["!pip install -Uq lark"]}, {"cell_type": "code", "execution_count": 3, "id": "51b15e5c-9b92-4d40-a149-b56335d330d9", "metadata": {"tags": []}, "outputs": [], "source": ["import os\n", "import openai\n", "import sys\n", "sys.path.append('../..')\n", "\n", "from dotenv import load_dotenv, find_dotenv\n", "_ = load_dotenv(find_dotenv()) # read local .env file\n", "\n", "openai.api_key = os.environ['OPENAI_API_KEY']\n"]}, {"attachments": {}, "cell_type": "markdown", "id": "c2d552e1", "metadata": {}, "source": ["### 1.1 \u76f8\u4f3c\u6027\u641c\u7d22"]}, {"cell_type": "code", "execution_count": 4, "id": "fe368042", "metadata": {"tags": []}, "outputs": [], "source": ["from langchain.vectorstores import Chroma\n", "from langchain.embeddings.openai import OpenAIEmbeddings\n", "persist_directory = 'docs/chroma/cs229_lectures/'\n", "persist_directory_chinese = 'docs/chroma/matplotlib/'"]}, {"attachments": {}, "cell_type": "markdown", "id": "5ba63e21", "metadata": {}, "source": ["\u5c06\u4e0a\u8282\u8bfe\u6240\u4fdd\u5b58\u7684\u5411\u91cf\u6570\u636e\u5e93(`VectorDB`)\u52a0\u8f7d\u8fdb\u6765"]}, {"cell_type": "code", "execution_count": 5, "id": "a0189dc5", "metadata": {"tags": []}, "outputs": [], "source": ["embedding = OpenAIEmbeddings()"]}, {"cell_type": "code", "execution_count": 6, "id": "2be10170", "metadata": {}, "outputs": [], "source": ["vectordb = Chroma(\n", " persist_directory=persist_directory,\n", " embedding_function=embedding\n", ")"]}, {"cell_type": "code", "execution_count": 7, "id": "3659e0f7", "metadata": {"tags": []}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["209\n"]}], "source": ["print(vectordb._collection.count())"]}, {"cell_type": "code", 
"execution_count": 8, "id": "a01ab000", "metadata": {}, "outputs": [], "source": ["vectordb_chinese = Chroma(\n", " persist_directory=persist_directory_chinese,\n", " embedding_function=embedding\n", ")"]}, {"cell_type": "code", "execution_count": 9, "id": "a6998a03", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["27\n"]}], "source": ["print(vectordb_chinese._collection.count())"]}, {"attachments": {}, "cell_type": "markdown", "id": "9ae4fdd8", "metadata": {}, "source": ["\u8ba9\u6211\u4eec\u73b0\u5728\u6765\u770b\u770b\u6700\u5927\u8fb9\u9645\u76f8\u5173\u6027\u7684\u4f8b\u5b50\u3002\u56e0\u6b64\uff0c\u6211\u4eec\u5c06\u4ece\u4e0b\u9762\u793a\u4f8b\u4e2d\u52a0\u8f7d\u6709\u5173\u8611\u83c7\u7684\u4fe1\u606f\u3002\n", "\n", "\u8ba9\u6211\u4eec\u73b0\u5728\u8fd0\u884c\u5b83\u4e0eMMR\u3002\u8ba9\u6211\u4eec\u4f20\u5165k\u7b49\u4e8e2\u3002\u6211\u4eec\u4ecd\u7136\u5e0c\u671b\u8fd4\u56de\u4e24\u4e2a\u6587\u6863\uff0c\u4f46\u8ba9\u6211\u4eec\u8bbe\u7f6e\u83b7\u53d6k\u7b49\u4e8e3\uff0c\u5176\u4e2d\u6211\u4eec\u6700\u521d\u83b7\u53d6\u6240\u6709\u4e09\u4e2a\u6587\u6863\u3002\u6211\u4eec\u73b0\u5728\u53ef\u4ee5\u770b\u5230\uff0c\u6211\u4eec\u68c0\u7d22\u7684\u6587\u6863\u4e2d\u8fd4\u56de\u4e86\u6709\u6bd2\u7684\u4fe1\u606f\u3002"]}, {"cell_type": "code", "execution_count": 10, "id": "a807c758", "metadata": {"tags": []}, "outputs": [], "source": ["texts = [\n", " \"\"\"The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).\"\"\",\n", " \"\"\"A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.\"\"\",\n", " \"\"\"A. 
phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.\"\"\",\n", "]"]}, {"cell_type": "code", "execution_count": 11, "id": "b110cceb", "metadata": {}, "outputs": [], "source": ["texts_chinese = [\n", " \"\"\"\u6bd2\u9e45\u818f\u83cc\uff08Amanita phalloides\uff09\u5177\u6709\u5927\u578b\u4e14\u5f15\u4eba\u6ce8\u76ee\u7684\u5730\u4e0a\uff08epigeous\uff09\u5b50\u5b9e\u4f53\uff08basidiocarp\uff09\"\"\",\n", " \"\"\"\u4e00\u79cd\u5177\u6709\u5927\u578b\u5b50\u5b9e\u4f53\u7684\u8611\u83c7\u662f\u6bd2\u9e45\u818f\u83cc\uff08Amanita phalloides\uff09\u3002\u67d0\u4e9b\u54c1\u79cd\u5168\u767d\u3002\"\"\",\n", " \"\"\"A. phalloides\uff0c\u53c8\u540d\u6b7b\u4ea1\u5e3d\uff0c\u662f\u5df2\u77e5\u6240\u6709\u8611\u83c7\u4e2d\u6700\u6709\u6bd2\u7684\u4e00\u79cd\u3002\"\"\",\n", "]"]}, {"attachments": {}, "cell_type": "markdown", "id": "84cd5f1c", "metadata": {}, "source": ["\u5bf9\u4e8e\u8fd9\u4e2a\u4f8b\u5b50\uff0c\u6211\u4eec\u5c06\u521b\u5efa\u4e00\u4e2a\u5c0f\u6570\u636e\u5e93\uff0c\u6211\u4eec\u53ef\u4ee5\u4f5c\u4e3a\u4e00\u4e2a\u793a\u4f8b\u6765\u4f7f\u7528\u3002"]}, {"cell_type": "code", "execution_count": 12, "id": "715d54f3", "metadata": {"tags": []}, "outputs": [{"name": "stderr", "output_type": "stream", "text": ["100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 1.28it/s]\n"]}], "source": ["smalldb = Chroma.from_texts(texts, embedding=embedding)"]}, {"cell_type": "code", "execution_count": 13, "id": "305e1714", "metadata": {}, "outputs": [{"name": "stderr", "output_type": "stream", "text": ["100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 1/1 [00:00<00:00, 2.30it/s]\n"]}], "source": ["smalldb_chinese = Chroma.from_texts(texts_chinese, embedding=embedding)"]}, {"attachments": {}, "cell_type": "markdown", "id": "239a8d95", "metadata": {}, "source": ["\u4e0b\u9762\u662f\u6211\u4eec\u5bf9\u4e8e\u8fd9\u4e2a\u793a\u4f8b\u6240\u63d0\u51fa\u7684\u95ee\u9898"]}, {"cell_type": "code", 
"execution_count": 14, "id": "9a37b5a5", "metadata": {"tags": []}, "outputs": [], "source": ["question = \"Tell me about all-white mushrooms with large fruiting bodies\""]}, {"cell_type": "code", "execution_count": 15, "id": "92312e57", "metadata": {}, "outputs": [], "source": ["question_chinese = \"\u544a\u8bc9\u6211\u5173\u4e8e\u5177\u6709\u5927\u578b\u5b50\u5b9e\u4f53\u7684\u5168\u767d\u8272\u8611\u83c7\u7684\u4fe1\u606f\""]}, {"attachments": {}, "cell_type": "markdown", "id": "d3224a6d", "metadata": {}, "source": ["\u73b0\u5728\uff0c\u6211\u4eec\u53ef\u4ee5\u8fd0\u884c\u4e00\u4e2a\u76f8\u4f3c\u6027\u641c\u7d22\uff0c\u8bbe\u7f6ek=2\uff0c\u53ea\u8fd4\u56de\u4e24\u4e2a\u6700\u76f8\u5173\u7684\u6587\u6863\u3002\n", "\n", "\u6211\u4eec\u53ef\u4ee5\u770b\u5230\uff0c\u6ca1\u6709\u63d0\u5230\u5b83\u662f\u6709\u6bd2\u7684\u4e8b\u5b9e\u3002"]}, {"cell_type": "code", "execution_count": 16, "id": "24e3b025", "metadata": {"tags": []}, "outputs": [{"data": {"text/plain": ["[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. 
Some varieties are all-white.', metadata={}),\n", " Document(page_content='The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).', metadata={})]"]}, "execution_count": 16, "metadata": {}, "output_type": "execute_result"}], "source": ["smalldb.similarity_search(question, k=2)"]}, {"cell_type": "code", "execution_count": 17, "id": "d4c5a47d", "metadata": {}, "outputs": [{"data": {"text/plain": ["[Document(page_content='\u4e00\u79cd\u5177\u6709\u5927\u578b\u5b50\u5b9e\u4f53\u7684\u8611\u83c7\u662f\u6bd2\u9e45\u818f\u83cc\uff08Amanita phalloides\uff09\u3002\u67d0\u4e9b\u54c1\u79cd\u5168\u767d\u3002', metadata={}),\n", " Document(page_content='\u6bd2\u9e45\u818f\u83cc\uff08Amanita phalloides\uff09\u5177\u6709\u5927\u578b\u4e14\u5f15\u4eba\u6ce8\u76ee\u7684\u5730\u4e0a\uff08epigeous\uff09\u5b50\u5b9e\u4f53\uff08basidiocarp\uff09', metadata={})]"]}, "execution_count": 17, "metadata": {}, "output_type": "execute_result"}], "source": ["smalldb_chinese.similarity_search(question_chinese, k=2)"]}, {"attachments": {}, "cell_type": "markdown", "id": "bbb0ea94", "metadata": {}, "source": ["\u73b0\u5728\uff0c\u8ba9\u6211\u4eec\u8fd0\u884c\u6700\u5927\u8fb9\u9645\u76f8\u5173\u6027(MMR)\u3002\n", "\n", "\u8bbe\u7f6ek=2\uff0c\u56e0\u4e3a\u6211\u4eec\u4ecd\u7136\u5e0c\u671b\u8fd4\u56de\u4e24\u4e2a\u6587\u6863\u3002\u8bbe\u7f6efetch_k=3\uff0cfetch_k\u662f\u6211\u4eec\u6700\u521d\u83b7\u53d6\u7684\u6240\u6709\u6587\u6863(3\u4e2a)\u3002"]}, {"cell_type": "code", "execution_count": 18, "id": "4daa6c0d", "metadata": {"tags": []}, "outputs": [{"data": {"text/plain": ["[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.', metadata={}),\n", " Document(page_content='A. 
phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.', metadata={})]"]}, "execution_count": 18, "metadata": {}, "output_type": "execute_result"}], "source": ["smalldb.max_marginal_relevance_search(question,k=2, fetch_k=3)"]}, {"cell_type": "code", "execution_count": 19, "id": "e15521d2", "metadata": {}, "outputs": [{"data": {"text/plain": ["[Document(page_content='\u4e00\u79cd\u5177\u6709\u5927\u578b\u5b50\u5b9e\u4f53\u7684\u8611\u83c7\u662f\u6bd2\u9e45\u818f\u83cc\uff08Amanita phalloides\uff09\u3002\u67d0\u4e9b\u54c1\u79cd\u5168\u767d\u3002', metadata={}),\n", " Document(page_content='A. phalloides\uff0c\u53c8\u540d\u6b7b\u4ea1\u5e3d\uff0c\u662f\u5df2\u77e5\u6240\u6709\u8611\u83c7\u4e2d\u6700\u6709\u6bd2\u7684\u4e00\u79cd\u3002', metadata={})]"]}, "execution_count": 19, "metadata": {}, "output_type": "execute_result"}], "source": ["smalldb_chinese.max_marginal_relevance_search(question_chinese, k=2, fetch_k=3)"]}, {"attachments": {}, "cell_type": "markdown", "id": "e87c5f91", "metadata": {}, "source": ["\u6211\u4eec\u73b0\u5728\u53ef\u4ee5\u770b\u5230\uff0c\u6211\u4eec\u68c0\u7d22\u7684\u6587\u6863\u4e2d\u8fd4\u56de\u4e86\u6709\u6bd2\u7684\u4fe1\u606f\u3002"]}, {"attachments": {}, "cell_type": "markdown", "id": "5a29e8c9", "metadata": {}, "source": ["### 1.2 \u89e3\u51b3\u591a\u6837\u6027\uff1a\u6700\u5927\u8fb9\u9645\u76f8\u5173\u6027(MMR)"]}, {"attachments": {}, "cell_type": "markdown", "id": "a2b5c4ae", "metadata": {}, "source": ["\n", "\u6211\u4eec\u521a\u521a\u901a\u8fc7\u4e00\u4e2a\u793a\u4f8b\u5f15\u51fa\u4e86\u4e00\u4e2a\u95ee\u9898\uff1a\u5982\u4f55\u52a0\u5f3a\u641c\u7d22\u7ed3\u679c\u7684\u591a\u6837\u6027\u3002\n", " \n", "\u6700\u5927\u8fb9\u9645\u76f8\u5173\u6027(`Maximum marginal relevance`)\u8bd5\u56fe\u5728\u67e5\u8be2\u7684\u76f8\u5173\u6027\u548c\u7ed3\u679c\u7684\u591a\u6837\u6027\u4e4b\u95f4\u5b9e\u73b0\u4e24\u5168\u5176\u7f8e\u3002"]}, {"attachments": {}, "cell_type": "markdown", "id": "2360545c", "metadata": {}, "source": 
["\u8ba9\u6211\u4eec\u56de\u5230\u4e0a\u4e00\u8282\u8bfe\u7684\u4e00\u4e2a\u4f8b\u5b50\uff0c\u5f53\u6211\u4eec\u901a\u8fc7\u95ee\u9898\u5bf9\u5411\u91cf\u6570\u636e\u5e93\u8fdb\u884c\u76f8\u4f3c\u6027\u641c\u7d22\u540e\n", "\n", "\u6211\u4eec\u53ef\u4ee5\u770b\u770b\u524d\u4e24\u4e2a\u6587\u6863\uff0c\u53ea\u770b\u524d\u51e0\u4e2a\u5b57\u7b26\uff0c\u53ef\u4ee5\u770b\u5230\u5b83\u4eec\u662f\u76f8\u540c\u7684\u3002"]}, {"cell_type": "code", "execution_count": 20, "id": "9bb2c0a9", "metadata": {"tags": []}, "outputs": [], "source": ["question = \"what did they say about matlab?\"\n", "docs_ss = vectordb.similarity_search(question,k=3)"]}, {"cell_type": "code", "execution_count": 21, "id": "f07f8793", "metadata": {"tags": []}, "outputs": [{"data": {"text/plain": ["'those homeworks will be done in either MATLA B or in Octave, which is sort of \u2014 I \\nknow some people '"]}, "execution_count": 21, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_ss[0].page_content[:100]"]}, {"cell_type": "code", "execution_count": 22, "id": "e9f7e165", "metadata": {"tags": []}, "outputs": [{"data": {"text/plain": ["'those homeworks will be done in either MATLA B or in Octave, which is sort of \u2014 I \\nknow some people '"]}, "execution_count": 22, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_ss[1].page_content[:100]"]}, {"cell_type": "code", "execution_count": 23, "id": "e8e142eb", "metadata": {}, "outputs": [], "source": ["question_chinese = \"Matplotlib\u662f\u4ec0\u4e48\uff1f\"\n", "docs_ss_chinese = vectordb_chinese.similarity_search(question_chinese,k=3)"]}, {"cell_type": "code", "execution_count": 24, "id": "cf642f66", "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u7b2c\u2f00\u56de\uff1aMatplotlib \u521d\u76f8\u8bc6\\n\u2f00\u3001\u8ba4\u8bc6matplotlib\\nMatplotlib \u662f\u2f00\u4e2a Python 2D 
\u7ed8\u56fe\u5e93\uff0c\u80fd\u591f\u4ee5\u591a\u79cd\u786c\u62f7\u2ec9\u683c\u5f0f\u548c\u8de8\u5e73\u53f0\u7684\u4ea4\u4e92\u5f0f\u73af\u5883\u2f63\u6210\u51fa\u7248\u7269\u8d28\u91cf\u7684\u56fe\u5f62\uff0c\u2f64\u6765\u7ed8\u5236\u5404\u79cd'"]}, "execution_count": 24, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_ss_chinese[0].page_content[:100]"]}, {"cell_type": "code", "execution_count": 25, "id": "1e9f5cfe", "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u7b2c\u2f00\u56de\uff1aMatplotlib \u521d\u76f8\u8bc6\\n\u2f00\u3001\u8ba4\u8bc6matplotlib\\nMatplotlib \u662f\u2f00\u4e2a Python 2D \u7ed8\u56fe\u5e93\uff0c\u80fd\u591f\u4ee5\u591a\u79cd\u786c\u62f7\u2ec9\u683c\u5f0f\u548c\u8de8\u5e73\u53f0\u7684\u4ea4\u4e92\u5f0f\u73af\u5883\u2f63\u6210\u51fa\u7248\u7269\u8d28\u91cf\u7684\u56fe\u5f62\uff0c\u2f64\u6765\u7ed8\u5236\u5404\u79cd'"]}, "execution_count": 25, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_ss_chinese[1].page_content[:100]"]}, {"attachments": {}, "cell_type": "markdown", "id": "4c4ca1b6", "metadata": {}, "source": ["\u6ce8\u610f\uff1a\u4f7f\u7528`MMR`\u6240\u5f97\u51fa\u7ed3\u679c\u7684\u5dee\u5f02\u3002"]}, {"cell_type": "code", "execution_count": 26, "id": "222234c5", "metadata": {"tags": []}, "outputs": [], "source": ["docs_mmr = vectordb.max_marginal_relevance_search(question,k=3)"]}, {"cell_type": "code", "execution_count": 27, "id": "408935bc", "metadata": {}, "outputs": [], "source": ["docs_mmr_chinese = vectordb_chinese.max_marginal_relevance_search(question_chinese,k=3)"]}, {"attachments": {}, "cell_type": "markdown", "id": "9076db81", "metadata": {}, "source": ["\u5f53\u6211\u4eec\u8fd0\u884cMMR\u540e\u5f97\u5230\u7ed3\u679c\u65f6\uff0c\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u7b2c\u4e00\u4e2a\u4e0e\u4e4b\u524d\u7684\u76f8\u540c\uff0c\u56e0\u4e3a\u90a3\u662f\u6700\u76f8\u4f3c\u7684\u3002"]}, {"cell_type": "code", "execution_count": 28, "id": "93b20226", "metadata": {"tags": []}, "outputs": 
[{"data": {"text/plain": ["'those homeworks will be done in either MATLA B or in Octave, which is sort of \u2014 I \\nknow some people '"]}, "execution_count": 28, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_mmr[0].page_content[:100]"]}, {"cell_type": "code", "execution_count": 29, "id": "d0acfaab", "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u7b2c\u2f00\u56de\uff1aMatplotlib \u521d\u76f8\u8bc6\\n\u2f00\u3001\u8ba4\u8bc6matplotlib\\nMatplotlib \u662f\u2f00\u4e2a Python 2D \u7ed8\u56fe\u5e93\uff0c\u80fd\u591f\u4ee5\u591a\u79cd\u786c\u62f7\u2ec9\u683c\u5f0f\u548c\u8de8\u5e73\u53f0\u7684\u4ea4\u4e92\u5f0f\u73af\u5883\u2f63\u6210\u51fa\u7248\u7269\u8d28\u91cf\u7684\u56fe\u5f62\uff0c\u2f64\u6765\u7ed8\u5236\u5404\u79cd'"]}, "execution_count": 29, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_mmr_chinese[0].page_content[:100]"]}, {"attachments": {}, "cell_type": "markdown", "id": "7a93743f", "metadata": {}, "source": ["\u4f46\u662f\u5f53\u6211\u4eec\u8fdb\u884c\u5230\u7b2c\u4e8c\u4e2a\u65f6\uff0c\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u5b83\u662f\u4e0d\u540c\u7684\u3002\n", "\n", "\u5b83\u5728\u56de\u5e94\u4e2d\u83b7\u5f97\u4e86\u4e00\u4e9b\u591a\u6837\u6027\u3002"]}, {"cell_type": "code", "execution_count": 30, "id": "17d39762", "metadata": {"tags": []}, "outputs": [{"data": {"text/plain": ["'algorithm then? So what\u2019s different? 
How come I was making all that noise earlier about \\nleast squa'"]}, "execution_count": 30, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_mmr[1].page_content[:100]"]}, {"cell_type": "code", "execution_count": 31, "id": "93d3206c", "metadata": {}, "outputs": [{"data": {"text/plain": ["'By Datawhale \u6570\u636e\u53ef\u89c6\u5316\u5f00\u6e90\u2f29\u7ec4\\n\u00a9 Copyright \u00a9 Copyright 2021.y\u8f74\u5206\u4e3a\u5de6\u53f3\u4e24\u4e2a\uff0c\u56e0\u6b64 tick1 \u5bf9\u5e94\u5de6\u4fa7\u7684\u8f74\uff1b tick2 \u5bf9\u5e94\u53f3\u4fa7\u7684\u8f74\u3002\\nx\u8f74\u5206\u4e3a\u4e0a\u4e0b\u4e24\u4e2a'"]}, "execution_count": 31, "metadata": {}, "output_type": "execute_result"}], "source": ["docs_mmr_chinese[1].page_content[:100]"]}, {"attachments": {}, "cell_type": "markdown", "id": "b2b909bc", "metadata": {}, "source": ["### 1.3 \u89e3\u51b3\u7279\u6b8a\u6027\uff1a\u4f7f\u7528\u5143\u6570\u636e"]}, {"attachments": {}, "cell_type": "markdown", "id": "7b63c5ee", "metadata": {}, "source": ["\n", "\u5728\u4e0a\u4e00\u8282\u8bfe\u4e2d\uff0c\u6211\u4eec\u5c55\u793a\u4e86\u4e00\u4e2a\u95ee\u9898\uff0c\u662f\u8be2\u95ee\u4e86\u5173\u4e8e\u6587\u6863\u4e2d\u67d0\u4e00\u8bb2\u7684\u95ee\u9898\uff0c\u4f46\u5f97\u5230\u7684\u7ed3\u679c\u4e2d\u4e5f\u5305\u62ec\u4e86\u6765\u81ea\u5176\u4ed6\u8bb2\u7684\u7ed3\u679c\u3002\n", "\n", "\u4e3a\u4e86\u89e3\u51b3\u8fd9\u4e00\u95ee\u9898\uff0c\u5f88\u591a\u5411\u91cf\u6570\u636e\u5e93\u90fd\u652f\u6301\u5bf9`metadata`\u7684\u64cd\u4f5c\u3002\n", "\n", "`metadata`\u4e3a\u6bcf\u4e2a\u5d4c\u5165\u7684\u5757(embedded chunk)\u63d0\u4f9b\u4e0a\u4e0b\u6587\u3002"]}, {"cell_type": "code", "execution_count": 32, "id": "3c1a60b2", "metadata": {"tags": []}, "outputs": [], "source": ["question = \"what did they say about regression in the third lecture?\""]}, {"cell_type": "code", "execution_count": 33, "id": "ba98df3c", "metadata": {}, "outputs": [], "source": ["question_chinese = 
\"\u4ed6\u4eec\u5728\u7b2c\u4e8c\u8bb2\u4e2d\u5bf9Figure\u8bf4\u4e86\u4e9b\u4ec0\u4e48\uff1f\" "]}, {"attachments": {}, "cell_type": "markdown", "id": "3873525e", "metadata": {}, "source": ["\u73b0\u5728\uff0c\u6211\u4eec\u4ee5\u624b\u52a8\u7684\u65b9\u5f0f\u6765\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898\uff0c\u6211\u4eec\u4f1a\u6307\u5b9a\u4e00\u4e2a\u5143\u6570\u636e\u8fc7\u6ee4\u5668`filter`"]}, {"cell_type": "code", "execution_count": 34, "id": "a8612840", "metadata": {"tags": []}, "outputs": [], "source": ["docs = vectordb.similarity_search(\n", " question,\n", " k=3,\n", " filter={\"source\":\"docs/cs229_lectures/MachineLearning-Lecture03.pdf\"}\n", ")"]}, {"cell_type": "code", "execution_count": 35, "id": "b46c7e76", "metadata": {}, "outputs": [], "source": ["docs_chinese = vectordb_chinese.similarity_search(\n", " question_chinese,\n", " k=3,\n", " filter={\"source\":\"docs/matplotlib/\u7b2c\u4e8c\u56de\uff1a\u827a\u672f\u753b\u7b14\u89c1\u4e7e\u5764.pdf\"}\n", ")"]}, {"attachments": {}, "cell_type": "markdown", "id": "869aee28", "metadata": {}, "source": ["\u63a5\u4e0b\u6765\uff0c\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u7ed3\u679c\u90fd\u6765\u81ea\u5bf9\u5e94\u7684\u7ae0\u8282"]}, {"cell_type": "code", "execution_count": 36, "id": "97031876", "metadata": {"tags": []}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 0}\n", "{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 14}\n", "{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 4}\n"]}], "source": ["for d in docs:\n", " print(d.metadata)"]}, {"cell_type": "code", "execution_count": 35, "id": "2708f6ae", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["{'source': 'docs/matplotlib/\u7b2c\u4e8c\u56de\uff1a\u827a\u672f\u753b\u7b14\u89c1\u4e7e\u5764.pdf', 'page': 9}\n", "{'source': 
'docs/matplotlib/\u7b2c\u4e8c\u56de\uff1a\u827a\u672f\u753b\u7b14\u89c1\u4e7e\u5764.pdf', 'page': 10}\n", "{'source': 'docs/matplotlib/\u7b2c\u4e8c\u56de\uff1a\u827a\u672f\u753b\u7b14\u89c1\u4e7e\u5764.pdf', 'page': 0}\n"]}], "source": ["for d in docs_chinese:\n", " print(d.metadata)\n", " "]}, {"attachments": {}, "cell_type": "markdown", "id": "5e299f8e", "metadata": {}, "source": ["\u5f53\u7136\uff0c\u6211\u4eec\u4e0d\u80fd\u6bcf\u6b21\u90fd\u91c7\u7528\u624b\u52a8\u7684\u65b9\u5f0f\u6765\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898\uff0c\u8fd9\u4f1a\u663e\u5f97\u4e0d\u591f\u667a\u80fd\n", "\n", "\u4e0b\u4e00\u5c0f\u8282\u5c06\u8981\u5c55\u793a\u901a\u8fc7LLM\u6765\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898"]}, {"attachments": {}, "cell_type": "markdown", "id": "ccc2d784", "metadata": {}, "source": ["### 1.4 \u89e3\u51b3\u7279\u6b8a\u6027\uff1a\u5728\u5143\u6570\u636e\u4e2d\u4f7f\u7528\u81ea\u67e5\u8be2\u68c0\u7d22\u5668"]}, {"attachments": {}, "cell_type": "markdown", "id": "82ef44b6", "metadata": {}, "source": ["\u6211\u4eec\u6709\u4e00\u4e2a\u6709\u8da3\u7684\u6311\u6218\uff1a\u6211\u4eec\u901a\u5e38\u5e0c\u671b\u4ece\u67e5\u8be2\u672c\u8eab\u6765\u63a8\u65ad\u5143\u6570\u636e\u3002\n", "\n", "\u4e3a\u4e86\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898\uff0c\u6211\u4eec\u53ef\u4ee5\u4f7f\u7528SelfQueryRetriever\uff0c\u5b83\u4f7f\u7528LLM\u6765\u63d0\u53d6\uff1a\n", " \n", "1. \u7528\u4e8e\u5411\u91cf\u641c\u7d22\u7684\u67e5\u8be2(`query`)\u5b57\u7b26\u4e32\uff0c\u5373\uff1a\u95ee\u9898\n", "2. 
\u8981\u4e00\u8d77\u4f20\u5165\u7684\u5143\u6570\u636e\u8fc7\u6ee4\u5668\n", "\n", "\u5927\u591a\u6570\u5411\u91cf\u6570\u636e\u5e93\u652f\u6301\u5143\u6570\u636e\u8fc7\u6ee4\u5668\uff0c\u56e0\u6b64\u4e0d\u9700\u8981\u4efb\u4f55\u65b0\u7684\u6570\u636e\u5e93\u53ca\u7d22\u5f15\u3002"]}, {"cell_type": "code", "execution_count": 38, "id": "b1d06084", "metadata": {"tags": []}, "outputs": [], "source": ["from langchain.llms import OpenAI\n", "from langchain.retrievers.self_query.base import SelfQueryRetriever\n", "from langchain.chains.query_constructor.base import AttributeInfo"]}, {"cell_type": "code", "execution_count": 39, "id": "869c27c0", "metadata": {}, "outputs": [], "source": ["llm = OpenAI(temperature=0)"]}, {"attachments": {}, "cell_type": "markdown", "id": "acd194c5", "metadata": {}, "source": ["`AttributeInfo`\u7528\u4e8e\u6307\u5b9a\u5143\u6570\u636e\u4e2d\u7684\u4e0d\u540c\u5b57\u6bb5\u4ee5\u53ca\u5b83\u4eec\u5bf9\u5e94\u7684\u5185\u5bb9\u3002\n", "\n", "\u5728\u5143\u6570\u636e\u4e2d\uff0c\u6211\u4eec\u53ea\u6709\u4e24\u4e2a\u5b57\u6bb5\uff0c`source`\u548c`page`\u3002\n", "\n", "\u6211\u4eec\u5c06\u586b\u5199\u6bcf\u4e2a\u5c5e\u6027\u7684\u540d\u79f0\u3001\u63cf\u8ff0\u548c\u7c7b\u578b\u3002\n", "\n", "\u8fd9\u4e9b\u4fe1\u606f\u5b9e\u9645\u4e0a\u5c06\u88ab\u4f20\u9012\u7ed9LLM\uff0c\u6240\u4ee5\u9700\u8981\u5c3d\u53ef\u80fd\u8be6\u7ec6\u5730\u63cf\u8ff0\u3002"]}, {"cell_type": "code", "execution_count": 40, "id": "0aa5e698", "metadata": {"tags": []}, "outputs": [], "source": ["metadata_field_info = [\n", " AttributeInfo(\n", " name=\"source\",\n", " description=\"The lecture the chunk is from, should be one of `docs/cs229_lectures/MachineLearning-Lecture01.pdf`, `docs/cs229_lectures/MachineLearning-Lecture02.pdf`, or `docs/cs229_lectures/MachineLearning-Lecture03.pdf`\",\n", " type=\"string\",\n", " ),\n", " AttributeInfo(\n", " name=\"page\",\n", " description=\"The page from the lecture\",\n", " type=\"integer\",\n", 
" ),\n", "]"]}, {"cell_type": "code", "execution_count": 41, "id": "544ad7c1", "metadata": {}, "outputs": [], "source": ["metadata_field_info_chinese = [\n", " AttributeInfo(\n", " name=\"source\",\n", " description=\"\u8bb2\u4e49\u6765\u6e90\u4e8e `docs/matplotlib/\u7b2c\u4e00\u56de\uff1aMatplotlib\u521d\u76f8\u8bc6.pdf`, `docs/matplotlib/\u7b2c\u4e8c\u56de\uff1a\u827a\u672f\u753b\u7b14\u89c1\u4e7e\u5764.pdf`, or `docs/matplotlib/\u7b2c\u4e09\u56de\uff1a\u5e03\u5c40\u683c\u5f0f\u5b9a\u65b9\u5706.pdf` \u7684\u5176\u4e2d\u4e4b\u4e00\",\n", " type=\"string\",\n", " ),\n", " AttributeInfo(\n", " name=\"page\",\n", " description=\"\u8bb2\u4e49\u7684\u90a3\u4e00\u9875\",\n", " type=\"integer\",\n", " ),\n", "]"]}, {"cell_type": "code", "execution_count": 42, "id": "e7906c15", "metadata": {"tags": []}, "outputs": [], "source": ["document_content_description = \"Lecture notes\"\n", "retriever = SelfQueryRetriever.from_llm(\n", " llm,\n", " vectordb,\n", " document_content_description,\n", " metadata_field_info,\n", " verbose=True\n", ")"]}, {"cell_type": "code", "execution_count": 43, "id": "d5b99571", "metadata": {}, "outputs": [], "source": ["document_content_description_chinese = \"\u8bfe\u5802\u8bb2\u4e49\"\n", "retriever_chinese = SelfQueryRetriever.from_llm(\n", " llm,\n", " vectordb_chinese,\n", " document_content_description_chinese,\n", " metadata_field_info_chinese,\n", " verbose=True\n", ")"]}, {"cell_type": "code", "execution_count": 44, "id": "79d781b9", "metadata": {"tags": []}, "outputs": [], "source": ["question = \"what did they say about regression in the third lecture?\""]}, {"cell_type": "code", "execution_count": 45, "id": "8d9b7e18", "metadata": {}, "outputs": [], "source": ["question_chinese = \"\u4ed6\u4eec\u5728\u7b2c\u4e8c\u8bb2\u4e2d\u5bf9Figure\u8bf4\u4e86\u4e9b\u4ec0\u4e48\uff1f\" "]}, {"attachments": {}, "cell_type": "markdown", "id": "c51778b0-1fcd-40a4-bd6b-0f13fec8acb1", "metadata": {}, "source": 
["\u5f53\u4f60\u7b2c\u4e00\u6b21\u6267\u884c\u4e0b\u4e00\u884c\u65f6\uff0c\u4f60\u4f1a\u6536\u5230\u5173\u4e8epredict_and_parse\u5df2\u88ab\u5f03\u7528\u7684**\u8b66\u544a**\u3002 \u8fd9\u53ef\u4ee5\u5b89\u5168\u5730\u5ffd\u7565\u3002"]}, {"cell_type": "code", "execution_count": 46, "id": "1d4f9f7d", "metadata": {"tags": []}, "outputs": [{"name": "stderr", "output_type": "stream", "text": ["/root/autodl-tmp/env/gpt/lib/python3.10/site-packages/langchain/chains/llm.py:275: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n", " warnings.warn(\n"]}, {"name": "stdout", "output_type": "stream", "text": ["query='regression' filter=Comparison(comparator=, attribute='source', value='docs/cs229_lectures/MachineLearning-Lecture03.pdf') limit=None\n"]}], "source": ["docs = retriever.get_relevant_documents(question)"]}, {"cell_type": "code", "execution_count": 47, "id": "ea39a97e", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["query='Figure' filter=Comparison(comparator=, attribute='source', value='docs/matplotlib/\u7b2c\u4e8c\u8bb2\uff1a\u827a\u672f\u753b\u89e3\u7834.pdf') limit=None\n"]}], "source": ["docs_chinese = retriever_chinese.get_relevant_documents(question_chinese)"]}, {"cell_type": "code", "execution_count": 48, "id": "db04374e", "metadata": {"tags": []}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 14}\n", "{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 0}\n", "{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 10}\n", "{'source': 'docs/cs229_lectures/MachineLearning-Lecture03.pdf', 'page': 10}\n"]}], "source": ["for d in docs:\n", " print(d.metadata)"]}, {"cell_type": "code", "execution_count": 49, "id": "143061f5", "metadata": {}, "outputs": [], "source": ["for d in docs_chinese:\n", " print(d.metadata)"]}, {"attachments": {}, 
"cell_type": "markdown", "id": "297b8168", "metadata": {}, "source": ["### 1.5 \u5176\u4ed6\u6280\u5de7\uff1a\u538b\u7f29"]}, {"attachments": {}, "cell_type": "markdown", "id": "564144da", "metadata": {}, "source": ["\u53e6\u4e00\u79cd\u63d0\u9ad8\u68c0\u7d22\u5230\u7684\u6587\u6863\u8d28\u91cf\u7684\u65b9\u6cd5\u662f\u538b\u7f29\u3002\n", "\n", "\u4e0e\u67e5\u8be2\u6700\u76f8\u5173\u7684\u4fe1\u606f\u53ef\u80fd\u9690\u85cf\u5728\u5177\u6709\u5927\u91cf\u4e0d\u76f8\u5173\u6587\u672c\u7684\u6587\u6863\u4e2d\u3002\n", "\n", "\u5728\u5e94\u7528\u7a0b\u5e8f\u4e2d\u4f20\u9012\u5b8c\u6574\u7684\u6587\u6863\u53ef\u80fd\u4f1a\u5bfc\u81f4\u66f4\u6602\u8d35\u7684LLM\u8c03\u7528\u548c\u66f4\u5dee\u7684\u54cd\u5e94\u3002\n", "\n", "\u4e0a\u4e0b\u6587\u538b\u7f29\u5c31\u662f\u4e3a\u4e86\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898\u3002"]}, {"cell_type": "code", "execution_count": 50, "id": "a060cf74", "metadata": {"tags": []}, "outputs": [], "source": ["from langchain.retrievers import ContextualCompressionRetriever\n", "from langchain.retrievers.document_compressors import LLMChainExtractor"]}, {"cell_type": "code", "execution_count": 51, "id": "038649c8", "metadata": {"tags": []}, "outputs": [], "source": ["def pretty_print_docs(docs):\n", " print(f\"\\n{'-' * 100}\\n\".join([f\"Document {i+1}:\\n\\n\" + d.page_content for i, d in enumerate(docs)]))"]}, {"cell_type": "code", "execution_count": 52, "id": "fc686cf2", "metadata": {"tags": []}, "outputs": [], "source": ["llm = OpenAI(temperature=0)\n", "compressor = LLMChainExtractor.from_llm(llm) # \u538b\u7f29\u5668"]}, {"cell_type": "code", "execution_count": 53, "id": "82794397", "metadata": {"tags": []}, "outputs": [], "source": ["compression_retriever = ContextualCompressionRetriever(\n", " base_compressor=compressor,\n", " base_retriever=vectordb.as_retriever()\n", ")"]}, {"cell_type": "code", "execution_count": 54, "id": "915598f8", "metadata": {}, "outputs": [], "source": ["compression_retriever_chinese = 
ContextualCompressionRetriever(\n", " base_compressor=compressor,\n", " base_retriever=vectordb_chinese.as_retriever()\n", ")"]}, {"cell_type": "code", "execution_count": 55, "id": "cde86848", "metadata": {"tags": []}, "outputs": [{"name": "stderr", "output_type": "stream", "text": ["/root/autodl-tmp/env/gpt/lib/python3.10/site-packages/langchain/chains/llm.py:275: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n", " warnings.warn(\n"]}, {"name": "stdout", "output_type": "stream", "text": ["Document 1:\n", "\n", "\"MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data. And it's sort of an extremely easy to learn tool to use for implementing a lot of learning algorithms.\"\n", "----------------------------------------------------------------------------------------------------\n", "Document 2:\n", "\n", "\"MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data. And it's sort of an extremely easy to learn tool to use for implementing a lot of learning algorithms.\"\n", "----------------------------------------------------------------------------------------------------\n", "Document 3:\n", "\n", "\"And the student said, \"Oh, it was the MATLAB.\" So for those of you that don't know MATLAB yet, I hope you do learn it. It's not hard, and we'll actually have a short MATLAB tutorial in one of the discussion sections for those of you that don't know it.\"\n", "----------------------------------------------------------------------------------------------------\n", "Document 4:\n", "\n", "\"And the student said, \"Oh, it was the MATLAB.\" So for those of you that don't know MATLAB yet, I hope you do learn it. 
It's not hard, and we'll actually have a short MATLAB tutorial in one of the discussion sections for those of you that don't know it.\"\n"]}], "source": ["question = \"what did they say about matlab?\"\n", "compressed_docs = compression_retriever.get_relevant_documents(question)\n", "pretty_print_docs(compressed_docs)"]}, {"cell_type": "code", "execution_count": 56, "id": "39726b24", "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["Document 1:\n", "\n", "Matplotlib \u662f\u2f00\u4e2a Python 2D \u7ed8\u56fe\u5e93\uff0c\u80fd\u591f\u4ee5\u591a\u79cd\u786c\u62f7\u2ec9\u683c\u5f0f\u548c\u8de8\u5e73\u53f0\u7684\u4ea4\u4e92\u5f0f\u73af\u5883\u2f63\u6210\u51fa\u7248\u7269\u8d28\u91cf\u7684\u56fe\u5f62\uff0c\u2f64\u6765\u7ed8\u5236\u5404\u79cd\u9759\u6001\uff0c\u52a8\u6001\uff0c\u4ea4\u4e92\u5f0f\u7684\u56fe\u8868\u3002\n", "----------------------------------------------------------------------------------------------------\n", "Document 2:\n", "\n", "Matplotlib \u662f\u2f00\u4e2a Python 2D \u7ed8\u56fe\u5e93\uff0c\u80fd\u591f\u4ee5\u591a\u79cd\u786c\u62f7\u2ec9\u683c\u5f0f\u548c\u8de8\u5e73\u53f0\u7684\u4ea4\u4e92\u5f0f\u73af\u5883\u2f63\u6210\u51fa\u7248\u7269\u8d28\u91cf\u7684\u56fe\u5f62\uff0c\u2f64\u6765\u7ed8\u5236\u5404\u79cd\u9759\u6001\uff0c\u52a8\u6001\uff0c\u4ea4\u4e92\u5f0f\u7684\u56fe\u8868\u3002\n"]}], "source": ["question_chinese = \"Matplotlib\u662f\u4ec0\u4e48\uff1f\"\n", "compressed_docs_chinese = compression_retriever_chinese.get_relevant_documents(question_chinese)\n", "pretty_print_docs(compressed_docs_chinese)"]}, {"attachments": {}, "cell_type": "markdown", "id": "049b2601", "metadata": {}, "source": ["\u73b0\u5728\u5f53\u6211\u4eec\u63d0\u51fa\u95ee\u9898\u540e\uff0c\u67e5\u770b\u7ed3\u679c\u6587\u6863\n", "\n", "\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u4e24\u4ef6\u4e8b\u3002\n", "\n", "1. \u5b83\u4eec\u6bd4\u6b63\u5e38\u6587\u6863\u77ed\u5f88\u591a\n", "2. 
\u4ecd\u7136\u6709\u4e00\u4e9b\u91cd\u590d\u7684\u4e1c\u897f\uff0c\u8fd9\u662f\u56e0\u4e3a\u5728\u5e95\u5c42\u6211\u4eec\u4f7f\u7528\u7684\u662f\u8bed\u4e49\u641c\u7d22\u7b97\u6cd5\u3002\n", "\n", "\u8fd9\u5c31\u662f\u6211\u4eec\u5728\u672c\u8bfe\u7a0b\u524d\u9762\u4f7f\u7528MMR\u89e3\u51b3\u7684\u95ee\u9898\u3002\n", "\n", "\u8fd9\u662f\u4e00\u4e2a\u5f88\u597d\u7684\u4f8b\u5b50\uff0c\u4f60\u53ef\u4ee5\u7ed3\u5408\u5404\u79cd\u6280\u672f\u5f97\u5230\u6700\u597d\u7684\u53ef\u80fd\u7ed3\u679c\u3002"]}, {"attachments": {}, "cell_type": "markdown", "id": "82c4fc4d", "metadata": {}, "source": ["## \u4e8c\u3001\u7ed3\u5408\u5404\u79cd\u6280\u672f"]}, {"attachments": {}, "cell_type": "markdown", "id": "54432975", "metadata": {}, "source": ["\u4e3a\u4e86\u505a\u5230\u8fd9\u4e00\u70b9\uff0c\u6211\u4eec\u5728\u4ece\u5411\u91cf\u6570\u636e\u5e93\u521b\u5efa\u68c0\u7d22\u5668\u65f6\uff0c\u53ef\u4ee5\u5c06\u641c\u7d22\u7c7b\u578b\u8bbe\u7f6e\u4e3aMMR\u3002\n", "\n", "\u7136\u540e\u6211\u4eec\u53ef\u4ee5\u91cd\u65b0\u8fd0\u884c\u8fd9\u4e2a\u8fc7\u7a0b\uff0c\u770b\u5230\u6211\u4eec\u8fd4\u56de\u7684\u662f\u4e00\u4e2a\u8fc7\u6ee4\u8fc7\u7684\u7ed3\u679c\u96c6\uff0c\u5176\u4e2d\u4e0d\u5305\u542b\u4efb\u4f55\u91cd\u590d\u7684\u4fe1\u606f\u3002"]}, {"cell_type": "code", "execution_count": 57, "id": "161ae1ad", "metadata": {"tags": []}, "outputs": [], "source": ["compression_retriever = ContextualCompressionRetriever(\n", " base_compressor=compressor,\n", " base_retriever=vectordb.as_retriever(search_type = \"mmr\")\n", ")"]}, {"cell_type": "code", "execution_count": 58, "id": "cd6396bb", "metadata": {}, "outputs": [], "source": ["compression_retriever_chinese = ContextualCompressionRetriever(\n", " base_compressor=compressor,\n", " base_retriever=vectordb_chinese.as_retriever(search_type = \"mmr\")\n", ")"]}, {"cell_type": "code", "execution_count": 59, "id": "e77ccae1", "metadata": {"tags": []}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["Document 1:\n", "\n", 
"\"MATLAB is I guess part of the programming language that makes it very easy to write codes using matrices, to write code for numerical routines, to move data around, to plot data. And it's sort of an extremely easy to learn tool to use for implementing a lot of learning algorithms.\"\n", "----------------------------------------------------------------------------------------------------\n", "Document 2:\n", "\n", "\"And the student said, \"Oh, it was the MATLAB.\" So for those of you that don't know MATLAB yet, I hope you do learn it. It's not hard, and we'll actually have a short MATLAB tutorial in one of the discussion sections for those of you that don't know it.\"\n"]}], "source": ["question = \"what did they say about matlab?\"\n", "compressed_docs = compression_retriever.get_relevant_documents(question)\n", "pretty_print_docs(compressed_docs)"]}, {"cell_type": "code", "execution_count": 60, "id": "fe68a14b", "metadata": {}, "outputs": [{"name": "stderr", "output_type": "stream", "text": ["Retrying langchain.llms.openai.completion_with_retry.._completion_with_retry in 4.0 seconds as it raised APIError: Request failed due to server shutdown {\n", " \"error\": {\n", " \"message\": \"Request failed due to server shutdown\",\n", " \"type\": \"server_error\",\n", " \"param\": null,\n", " \"code\": null\n", " }\n", "}\n", " 500 {'error': {'message': 'Request failed due to server shutdown', 'type': 'server_error', 'param': None, 'code': None}} {'Date': 'Sun, 16 Jul 2023 05:28:06 GMT', 'Content-Type': 'application/json', 'Content-Length': '141', 'Connection': 'keep-alive', 'access-control-allow-origin': '*', 'openai-model': 'text-davinci-003', 'openai-organization': 'user-xnghkpntwvm31crtmex7n2j0', 'openai-processing-ms': '1159', 'openai-version': '2020-10-01', 'strict-transport-security': 'max-age=15724800; includeSubDomains', 'x-ratelimit-limit-requests': '3000', 'x-ratelimit-limit-tokens': '250000', 'x-ratelimit-remaining-requests': '2999', 
'x-ratelimit-remaining-tokens': '249744', 'x-ratelimit-reset-requests': '20ms', 'x-ratelimit-reset-tokens': '61ms', 'x-request-id': '22b2986140ddb5bcc689423e056b7daf', 'CF-Cache-Status': 'DYNAMIC', 'Server': 'cloudflare', 'CF-RAY': '7e77d6f7590b1ec8-NRT', 'alt-svc': 'h3=\":443\"; ma=86400'}.\n"]}, {"name": "stdout", "output_type": "stream", "text": ["Document 1:\n", "\n", "Matplotlib \u662f\u2f00\u4e2a Python 2D \u7ed8\u56fe\u5e93\uff0c\u80fd\u591f\u4ee5\u591a\u79cd\u786c\u62f7\u2ec9\u683c\u5f0f\u548c\u8de8\u5e73\u53f0\u7684\u4ea4\u4e92\u5f0f\u73af\u5883\u2f63\u6210\u51fa\u7248\u7269\u8d28\u91cf\u7684\u56fe\u5f62\uff0c\u2f64\u6765\u7ed8\u5236\u5404\u79cd\u9759\u6001\uff0c\u52a8\u6001\uff0c\u4ea4\u4e92\u5f0f\u7684\u56fe\u8868\u3002\n"]}], "source": ["question_chinese = \"Matplotlib\u662f\u4ec0\u4e48\uff1f\"\n", "compressed_docs_chinese = compression_retriever_chinese.get_relevant_documents(question_chinese)\n", "pretty_print_docs(compressed_docs_chinese)"]}, {"attachments": {}, "cell_type": "markdown", "id": "6c2b63e1", "metadata": {}, "source": ["## \u4e09\u3001\u5176\u4ed6\u7c7b\u578b\u7684\u68c0\u7d22"]}, {"attachments": {}, "cell_type": "markdown", "id": "3e777a7b", "metadata": {}, "source": ["\u503c\u5f97\u6ce8\u610f\u7684\u662f\uff0cvectordb\u5e76\u4e0d\u662f\u552f\u4e00\u4e00\u79cd\u68c0\u7d22\u6587\u6863\u7684\u5de5\u5177\u3002\n", "\n", "`LangChain`\u68c0\u7d22\u5668\u62bd\u8c61\u5305\u62ec\u5176\u4ed6\u68c0\u7d22\u6587\u6863\u7684\u65b9\u5f0f\uff0c\u5982\uff1a`TF-IDF` \u6216 `SVM`\u3002"]}, {"cell_type": "code", "execution_count": 61, "id": "83d2e808", "metadata": {"tags": []}, "outputs": [], "source": ["from langchain.retrievers import SVMRetriever\n", "from langchain.retrievers import TFIDFRetriever\n", "from langchain.document_loaders import PyPDFLoader\n", "from langchain.text_splitter import RecursiveCharacterTextSplitter"]}, {"cell_type": "code", "execution_count": 62, "id": "bcf5b760", "metadata": {"tags": []}, "outputs": [], "source": ["# 
\u52a0\u8f7dPDF\n", "loader = PyPDFLoader(\"docs/cs229_lectures/MachineLearning-Lecture01.pdf\")\n", "pages = loader.load()\n", "all_page_text = [p.page_content for p in pages]\n", "joined_page_text = \" \".join(all_page_text)\n", "\n", "# \u5206\u5272\u6587\u672c\n", "text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1500,chunk_overlap = 150)\n", "splits = text_splitter.split_text(joined_page_text)\n"]}, {"cell_type": "code", "execution_count": 77, "id": "1657e768", "metadata": {}, "outputs": [], "source": ["# \u52a0\u8f7dPDF\n", "loader_chinese = PyPDFLoader(\"docs/matplotlib/\u7b2c\u4e00\u56de\uff1aMatplotlib\u521d\u76f8\u8bc6.pdf\")\n", "pages_chinese = loader_chinese.load()\n", "all_page_text_chinese = [p.page_content for p in pages_chinese]\n", "joined_page_text_chinese = \" \".join(all_page_text_chinese)\n", "\n", "# \u5206\u5272\u6587\u672c\n", "text_splitter_chinese = RecursiveCharacterTextSplitter(chunk_size = 1500,chunk_overlap = 150)\n", "splits_chinese = text_splitter_chinese.split_text(joined_page_text_chinese)"]}, {"cell_type": "code", "execution_count": 64, "id": "59abbaff", "metadata": {}, "outputs": [], "source": ["# \u68c0\u7d22\n", "svm_retriever = SVMRetriever.from_texts(splits, embedding)\n", "tfidf_retriever = TFIDFRetriever.from_texts(splits)"]}, {"cell_type": "code", "execution_count": 66, "id": "7885389e", "metadata": {}, "outputs": [{"data": {"text/plain": ["Document(page_content=\"let me just check what questions you have righ t now. So if there are no questions, I'll just \\nclose with two reminders, which are after class today or as you start to talk with other \\npeople in this class, I just encourage you again to start to form project partners, to try to \\nfind project partners to do your project with. And also, this is a good time to start forming \\nstudy groups, so either talk to your friends or post in the newsgroup, but we just \\nencourage you to try to star t to do both of those today, okay? 
Form study groups, and try \\nto find two other project partners. \\nSo thank you. I'm looking forward to teaching this class, and I'll see you in a couple of \\ndays. [End of Audio] \\nDuration: 69 minutes\", metadata={})"]}, "execution_count": 66, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"What are major topics for this class?\" # \u8fd9\u95e8\u8bfe\u7684\u4e3b\u8981\u4e3b\u9898\u662f\u4ec0\u4e48\uff1f\n", "docs_svm = svm_retriever.get_relevant_documents(question)\n", "docs_svm[0]"]}, {"cell_type": "code", "execution_count": 67, "id": "2a1659c0", "metadata": {"tags": []}, "outputs": [{"data": {"text/plain": ["Document(page_content=\"Saxena and Min Sun here did, wh ich is given an image like this, right? This is actually a \\npicture taken of the Stanford campus. You can apply that sort of cl ustering algorithm and \\ngroup the picture into regions. Let me actually blow that up so that you can see it more \\nclearly. Okay. So in the middle, you see the lines sort of groupi ng the image together, \\ngrouping the image into [inaudible] regions. \\nAnd what Ashutosh and Min did was they then applied the learning algorithm to say can \\nwe take this clustering and us e it to build a 3D model of the world? And so using the \\nclustering, they then had a lear ning algorithm try to learn what the 3D structure of the \\nworld looks like so that they could come up with a 3D model that you can sort of fly \\nthrough, okay? Although many people used to th ink it's not possible to take a single \\nimage and build a 3D model, but using a lear ning algorithm and that sort of clustering \\nalgorithm is the first step. They were able to. \\nI'll just show you one more example. I like this because it's a picture of Stanford with our \\nbeautiful Stanford campus. So again, taking th e same sort of clustering algorithms, taking \\nthe same sort of unsupervised learning algor ithm, you can group the pixels into different \\nregions. 
And using that as a pre-processing step, they eventually built this sort of 3D model of Stanford campus in a single picture. You can sort of walk into the ceiling, look\", metadata={})"]}, "execution_count": 67, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"what did they say about matlab?\" # \u4ed6\u4eec\u5173\u4e8eMatlab\u8bf4\u4e86\u4e9b\u4ec0\u4e48\uff1f\n", "docs_tfidf = tfidf_retriever.get_relevant_documents(question)\n", "docs_tfidf[0]"]}, {"cell_type": "code", "execution_count": 78, "id": "5ade9c87", "metadata": {}, "outputs": [], "source": ["# \u4e2d\u6587\u7248\n", "svm_retriever_chinese = SVMRetriever.from_texts(splits_chinese, embedding)\n", "tfidf_retriever_chinese = TFIDFRetriever.from_texts(splits_chinese)"]}, {"cell_type": "code", "execution_count": 79, "id": "cc823bea", "metadata": {}, "outputs": [{"data": {"text/plain": ["Document(page_content='fig, ax = plt.subplots() \\n# step4 \u7ed8\u5236\u56fe\u50cf\uff0c \u8fd9\u2f00\u6a21\u5757\u7684\u6269\u5c55\u53c2\u8003\u7b2c\u2f06\u7ae0\u8fdb\u2f00\u6b65\u5b66\u4e60\\nax.plot(x, y, label=\\'linear\\') \\n# step5 \u6dfb\u52a0\u6807\u7b7e\uff0c\u2f42\u5b57\u548c\u56fe\u4f8b\uff0c\u8fd9\u2f00\u6a21\u5757\u7684\u6269\u5c55\u53c2\u8003\u7b2c\u56db\u7ae0\u8fdb\u2f00\u6b65\u5b66\u4e60\\nax.set_xlabel(\\'x label\\') \\nax.set_ylabel(\\'y label\\') \\nax.set_title(\"Simple Plot\") \\nax.legend() ;\\n\u601d\u8003\u9898\\n\u8bf7\u601d\u8003\u4e24\u79cd\u7ed8\u56fe\u6a21\u5f0f\u7684\u4f18\u7f3a\u70b9\u548c\u5404\u2f83\u9002\u5408\u7684\u4f7f\u2f64\u573a\u666f\\n\u5728\u7b2c\u4e94\u8282\u7ed8\u56fe\u6a21\u677f\u4e2d\u6211\u4eec\u662f\u4ee5 OO \u6a21\u5f0f\u4f5c\u4e3a\u4f8b\u2f26\u5c55\u793a\u7684\uff0c\u8bf7\u601d\u8003\u5e76\u5199\u2f00\u4e2a pyplot \u7ed8\u56fe\u6a21\u5f0f\u7684\u7b80\u5355\u6a21\u677f', metadata={})"]}, "execution_count": 79, "metadata": {}, "output_type": "execute_result"}], "source": ["question_chinese = 
\"\u8fd9\u95e8\u8bfe\u7684\u4e3b\u8981\u4e3b\u9898\u662f\u4ec0\u4e48\uff1f\" \n", "docs_svm_chinese = svm_retriever_chinese.get_relevant_documents(question_chinese)\n", "docs_svm_chinese[0]"]}, {"cell_type": "code", "execution_count": 80, "id": "01eb9d43", "metadata": {}, "outputs": [{"data": {"text/plain": ["Document(page_content='fig, ax = plt.subplots() \\n# step4 \u7ed8\u5236\u56fe\u50cf\uff0c \u8fd9\u2f00\u6a21\u5757\u7684\u6269\u5c55\u53c2\u8003\u7b2c\u2f06\u7ae0\u8fdb\u2f00\u6b65\u5b66\u4e60\\nax.plot(x, y, label=\\'linear\\') \\n# step5 \u6dfb\u52a0\u6807\u7b7e\uff0c\u2f42\u5b57\u548c\u56fe\u4f8b\uff0c\u8fd9\u2f00\u6a21\u5757\u7684\u6269\u5c55\u53c2\u8003\u7b2c\u56db\u7ae0\u8fdb\u2f00\u6b65\u5b66\u4e60\\nax.set_xlabel(\\'x label\\') \\nax.set_ylabel(\\'y label\\') \\nax.set_title(\"Simple Plot\") \\nax.legend() ;\\n\u601d\u8003\u9898\\n\u8bf7\u601d\u8003\u4e24\u79cd\u7ed8\u56fe\u6a21\u5f0f\u7684\u4f18\u7f3a\u70b9\u548c\u5404\u2f83\u9002\u5408\u7684\u4f7f\u2f64\u573a\u666f\\n\u5728\u7b2c\u4e94\u8282\u7ed8\u56fe\u6a21\u677f\u4e2d\u6211\u4eec\u662f\u4ee5 OO \u6a21\u5f0f\u4f5c\u4e3a\u4f8b\u2f26\u5c55\u793a\u7684\uff0c\u8bf7\u601d\u8003\u5e76\u5199\u2f00\u4e2a pyplot \u7ed8\u56fe\u6a21\u5f0f\u7684\u7b80\u5355\u6a21\u677f', metadata={})"]}, "execution_count": 80, "metadata": {}, "output_type": "execute_result"}], "source": ["question_chinese = \"Matplotlib\u662f\u4ec0\u4e48\uff1f\"\n", "docs_tfidf_chinese = tfidf_retriever_chinese.get_relevant_documents(question_chinese)\n", "docs_tfidf_chinese[0]"]}], "metadata": {"kernelspec": {"display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11"}}, "nbformat": 4, "nbformat_minor": 5}
\ No newline at end of file
diff --git "a/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/6.\351\227\256\347\255\224 Question Answering.ipynb" "b/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/6.\351\227\256\347\255\224 Question Answering.ipynb"
index a7b8da7e..92b6812c 100644
--- "a/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/6.\351\227\256\347\255\224 Question Answering.ipynb"
+++ "b/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/6.\351\227\256\347\255\224 Question Answering.ipynb"
@@ -1 +1 @@
-{"cells":[{"cell_type":"markdown","metadata":{},"source":["# 第六章 问答\n","\n"," - [一、引言](#一、引言)\n"," - [二、环境配置](#二、环境配置)\n"," - [三、加载向量数据库](#三、加载向量数据库)\n"," - [四、构造检索式问答连](#四、构造检索式问答连)\n"," - [五、深入探究检索式问答链](#五、深入探究检索式问答链)\n"," - [5.1 基于模板的检索式问答链](#5.1-基于模板的检索式问答链)\n"," - [5.2 基于 MapReduce 的检索式问答链](#5.2-基于-MapReduce-的检索式问答链)\n"," - [5.3 基于 Refine 的检索式问答链](#5.3-基于-Refine-的检索式问答链)\n"," - [六、实验:状态记录](#六、实验:状态记录)\n"]},{"cell_type":"markdown","metadata":{},"source":["## 一、引言\n","\n","\n","在上一章,我们已经讨论了如何检索与给定问题相关的文档。下一步是获取这些文档,拿到原始问题,将它们一起传递给语言模型,并要求它回答这个问题。在本课程中,我们将详细介绍这一过程,以及完成这项任务的几种不同方法。"]},{"cell_type":"markdown","metadata":{},"source":["我们已经完成了整个存储和获取,获取了相关的切分文档之后,现在我们需要将它们传递给语言模型,以获得答案。这个过程的一般流程如下:首先问题被提出,然后我们查找相关的文档,接着将这些切分文档和系统提示一起传递给语言模型,并获得答案。\n","\n","默认情况下,我们将所有的文档切片都传递到同一个上下文窗口中,即同一次语言模型调用中。然而,有一些不同的方法可以解决这个问题,它们都有优缺点。大部分优点来自于有时可能会有很多文档,但你简单地无法将它们全部传递到同一个上下文窗口中。MapReduce、Refine 和 MapRerank 是三种方法,用于解决这个短上下文窗口的问题。我们将在该课程中进行简要介绍。"]},{"cell_type":"markdown","metadata":{},"source":["## 二、环境配置"]},{"cell_type":"markdown","metadata":{},"source":["配置环境方法同前,此处不再赘述"]},{"cell_type":"code","execution_count":1,"metadata":{},"outputs":[],"source":["import os\n","import openai\n","import sys\n","sys.path.append('../..')\n","\n","from dotenv import load_dotenv, find_dotenv\n","\n","_ = load_dotenv(find_dotenv()) # read local .env file\n","\n","openai.api_key = os.environ['OPENAI_API_KEY']\n"]},{"cell_type":"markdown","metadata":{},"source":["在2023年9月2日之后,GPT-3.5 API 会进行更新,因此此处需要进行一个时间判断"]},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["gpt-3.5-turbo\n"]}],"source":["import datetime\n","current_date = datetime.datetime.now().date()\n","if current_date < datetime.date(2023, 9, 2):\n"," llm_name = \"gpt-3.5-turbo-0301\"\n","else:\n"," llm_name = \"gpt-3.5-turbo\"\n","print(llm_name)"]},{"cell_type":"markdown","metadata":{},"source":["## 
三、加载向量数据库"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[],"source":["# 加载在之前已经进行持久化的向量数据库\n","from langchain.vectorstores import Chroma\n","from langchain.embeddings.openai import OpenAIEmbeddings\n","persist_directory = 'docs/chroma/cs229_lectures/'\n","embedding = OpenAIEmbeddings(model='text-embedding-3-small')\n","vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)"]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["209\n"]}],"source":["# 可以看见包含了我们之前进行分割的209个文档\n","print(vectordb._collection.count())"]},{"cell_type":"markdown","metadata":{},"source":["我们可以测试一下对于一个提问进行向量检索。如下代码会在向量数据库中根据相似性进行检索,返回给你 k 个文档。"]},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[{"data":{"text/plain":["3"]},"execution_count":5,"metadata":{},"output_type":"execute_result"}],"source":["question = \"What are major topics for this class?\"\n","docs = vectordb.similarity_search(question,k=3)\n","len(docs)"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[{"data":{"text/plain":["3"]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["question = \"这节课的主要话题是什么\"\n","docs = vectordb.similarity_search(question,k=3)\n","len(docs)"]},{"cell_type":"markdown","metadata":{},"source":["## 四、构造检索式问答连"]},{"cell_type":"markdown","metadata":{},"source":["基于 LangChain,我们可以构造一个使用 GPT3.5 进行问答的检索式问答链,这是一种通过检索步骤进行问答的方法。我们可以通过传入一个语言模型和一个向量数据库来创建它作为检索器。然后,我们可以用问题作为查询调用它,得到一个答案。"]},{"cell_type":"code","execution_count":7,"metadata":{},"outputs":[],"source":["# 使用 ChatGPT3.5,温度设置为0\n","from langchain_community.chat_models import ChatOpenAI\n","llm = ChatOpenAI(model_name=llm_name, temperature=0)"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[],"source":["# 导入检索式问答链\n","from langchain.chains.retrieval_qa.base import RetrievalQA"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[],"source":["# 
声明一个检索式问答链\n","qa_chain = RetrievalQA.from_chain_type(\n"," llm,\n"," retriever=vectordb.as_retriever()\n",")"]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[],"source":["# 可以以该方式进行检索问答\n","question = \"What are major topics for this class?\"\n","result = qa_chain({\"query\": question})"]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[{"data":{"text/plain":["'The major topics for this class seem to include machine learning, statistics, and algebra. Additionally, there will be discussions on extensions of the material covered in the main lectures.'"]},"execution_count":11,"metadata":{},"output_type":"execute_result"}],"source":["result[\"result\"]"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[],"source":["# 可以以该方式进行检索问答\n","question = \"这节课的主要话题是什么\"\n","result = qa_chain({\"query\": question})"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["这节课的主要话题是机器学习。\n"]}],"source":["print(result[\"result\"])"]},{"cell_type":"markdown","metadata":{},"source":["## 五、深入探究检索式问答链"]},{"cell_type":"markdown","metadata":{},"source":["通过上述代码,我们可以实现一个简单的检索式问答链。接下来,让我们深入其中的细节,看看在这个检索式问答链中,LangChain 都做了些什么。\n"]},{"cell_type":"markdown","metadata":{},"source":["### 5.1 基于模板的检索式问答链"]},{"cell_type":"markdown","metadata":{},"source":["\n","我们首先定义了一个提示模板。它包含一些关于如何使用下面的上下文片段的说明,然后有一个上下文变量的占位符。"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[],"source":["from langchain.prompts import PromptTemplate\n","\n","# Build prompt\n","template = \"\"\"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say \"thanks for asking!\" at the end of the answer. 
\n","{context}\n","Question: {question}\n","Helpful Answer:\"\"\"\n","QA_CHAIN_PROMPT = PromptTemplate.from_template(template)\n"]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[],"source":["# 中文版\n","from langchain.prompts import PromptTemplate\n","\n","# Build prompt\n","template = \"\"\"使用以下上下文片段来回答最后的问题。如果你不知道答案,只需说不知道,不要试图编造答案。答案最多使用三个句子。尽量简明扼要地回答。在回答的最后一定要说\"感谢您的提问!\"\n","{context}\n","问题:{question}\n","有用的回答:\"\"\"\n","QA_CHAIN_PROMPT = PromptTemplate.from_template(template)\n"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[],"source":["# Run chain\n","qa_chain = RetrievalQA.from_chain_type(\n"," llm,\n"," retriever=vectordb.as_retriever(),\n"," return_source_documents=True,\n"," chain_type_kwargs={\"prompt\": QA_CHAIN_PROMPT}\n",")"]},{"cell_type":"code","execution_count":17,"metadata":{},"outputs":[],"source":["question = \"Is probability a class topic?\""]},{"cell_type":"code","execution_count":18,"metadata":{},"outputs":[],"source":["result = qa_chain({\"query\": question})"]},{"cell_type":"code","execution_count":19,"metadata":{},"outputs":[{"data":{"text/plain":["'Yes, probability is a class topic as the instructor assumes familiarity with basic probability and statistics. They will also use a probabilistic interpretation to derive the next learning algorithm. However, the class will not be very programming intensive. 
If you have not seen probability or statistics for a while, they will go over it in the discussion sections.感谢您的提问!'"]},"execution_count":19,"metadata":{},"output_type":"execute_result"}],"source":["result[\"result\"]"]},{"cell_type":"code","execution_count":20,"metadata":{},"outputs":[],"source":["# 中文版\n","question = \"机器学习是其中一节的话题吗\""]},{"cell_type":"code","execution_count":21,"metadata":{},"outputs":[],"source":["result = qa_chain({\"query\": question})"]},{"cell_type":"code","execution_count":22,"metadata":{},"outputs":[{"data":{"text/plain":["'是的,机器学习是这门课程的一个主题。感谢您的提问!'"]},"execution_count":22,"metadata":{},"output_type":"execute_result"}],"source":["result[\"result\"]"]},{"cell_type":"code","execution_count":23,"metadata":{},"outputs":[{"data":{"text/plain":["Document(page_content=\"a machine learning class, right? If you go to a carpentry school, they can give you the \\ntools of carpentry. They'll give you a hamme r, a bunch of nails, a screwdriver or \\nwhatever. But a master carpenter will be able to use those tools far better than most of us \\nin this room. I know a carpen ter can do things with a hammer and nail that I couldn't \\npossibly. And it's actually a littl e bit like that in machine learning, too. One thing that's \\nsadly not taught in many courses on machine l earning is how to take the tools of machine \\nlearning and really, really apply them well. \\nSo in the same way, so the tools of machin e learning are I wanna say quite a bit more \\nadvanced than the tools of carpentry. Maybe a carpenter will disagree . But a large part of \\nthis class will be just givi ng you the raw tools of machine learning, just the algorithms \\nand so on. 
But what I plan to do throughout this entire quarter, not just in the segment of \\nlearning theory, but actually as a theme r unning through everything I do this quarter, will \\nbe to try to convey to you the skills to real ly take the learning al gorithm ideas and really \\nto get them to work on a problem. \\nIt's sort of hard for me to stand here and say how big a deal that is, but when I walk \\naround companies in Silicon Valley, it's co mpletely not uncommon for me to see \\nsomeone using some machine learning algorith m and then explain to me what they've\", metadata={'page': 14, 'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf'})"]},"execution_count":23,"metadata":{},"output_type":"execute_result"}],"source":["result[\"source_documents\"][0]"]},{"cell_type":"markdown","metadata":{},"source":["这种方法非常好,因为它只涉及对语言模型的一次调用。然而,它也有局限性,即如果文档太多,可能无法将它们全部适配到上下文窗口中。我们可以使用另一种技术来对文档进行问答,即MapReduce技术。"]},{"cell_type":"markdown","metadata":{},"source":["### 5.2 基于 MapReduce 的检索式问答链"]},{"cell_type":"markdown","metadata":{},"source":["在 MapReduce 技术中,首先将每个独立的文档单独发送到语言模型以获取原始答案。然后,这些答案通过最终对语言模型的一次调用组合成最终的答案。虽然这样涉及了更多对语言模型的调用,但它的优势在于可以处理任意数量的文档。\n"]},{"cell_type":"code","execution_count":24,"metadata":{},"outputs":[],"source":["qa_chain_mr = RetrievalQA.from_chain_type(\n"," llm,\n"," retriever=vectordb.as_retriever(),\n"," chain_type=\"map_reduce\"\n",")"]},{"cell_type":"code","execution_count":25,"metadata":{},"outputs":[],"source":["question = \"Is probability a class topic?\"\n","result = qa_chain_mr({\"query\": question})"]},{"cell_type":"code","execution_count":26,"metadata":{},"outputs":[{"data":{"text/plain":["'Yes, probability is a class topic mentioned in the text. The instructor assumes familiarity with basic probability and statistics, stating that most undergraduate statistics classes, like Stat 116 taught at Stanford, will be more than enough. 
The text also mentions going over some prerequisites, including probability, in the discussion sections as a refresher course.'"]},"execution_count":26,"metadata":{},"output_type":"execute_result"}],"source":["result[\"result\"]"]},{"cell_type":"code","execution_count":27,"metadata":{},"outputs":[{"data":{"text/plain":["'不,这段文本没有提到概率论是其中一节的话题。'"]},"execution_count":27,"metadata":{},"output_type":"execute_result"}],"source":["qa_chain_mr = RetrievalQA.from_chain_type(\n"," llm,\n"," retriever=vectordb.as_retriever(),\n"," chain_type=\"map_reduce\"\n",")\n","# 中文版\n","question = \"概率论是其中一节的话题吗\"\n","result = qa_chain_mr({\"query\": question})\n","result[\"result\"]"]},{"cell_type":"markdown","metadata":{},"source":["当我们将之前的问题通过这个链进行运行时,我们可以看到这种方法的两个问题。第一,速度要慢得多。第二,结果实际上更差。根据给定文档的这一部分,对这个问题并没有明确的答案。这可能是因为它是基于每个文档单独回答的。因此,如果信息分布在两个文档之间,它并没有在同一上下文中获取到所有的信息。"]},{"cell_type":"code","execution_count":28,"metadata":{},"outputs":[],"source":["#import os\n","#os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n","#os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.langchain.plus\"\n","#os.environ[\"LANGCHAIN_API_KEY\"] = \"...\" # replace dots with your api key"]},{"cell_type":"markdown","metadata":{},"source":["我们可导入上述环境变量,然后探寻 MapReduce 文档链的细节。例如,上述演示中,我们实际上涉及了四个单独的对语言模型的调用。在运行完每个文档后,它们会在最终链式中组合在一起,即Stuffed Documents链,将所有这些回答合并到最终的调用中。"]},{"cell_type":"markdown","metadata":{},"source":["### 5.3 基于 Refine 的检索式问答链"]},{"cell_type":"markdown","metadata":{},"source":["我们可以类似地设置链式类型为Refine。这是一种新的链式类型。Refine 文档链类似于 MapReduce 链,对于每一个文档,会调用一次 LLM,但有所改进的是,我们每次发送给 LLM 的最终提示是一个序列,这个序列会将先前的响应与新数据结合在一起,并请求得到改进后的响应。因此,这是一种类似于 RNN 的概念,我们增强了上下文,从而解决信息分布在不同文档的问题。"]},{"cell_type":"code","execution_count":29,"metadata":{},"outputs":[{"data":{"text/plain":["'Based on the additional context provided, it seems that probability may indeed be a class topic within the broader subject of machine learning. 
The discussion sections will be used to go over extensions for the material taught in the main lectures, and probability could be one of the topics covered in these extensions. Therefore, probability could potentially be a class topic within the context of machine learning.'"]},"execution_count":29,"metadata":{},"output_type":"execute_result"}],"source":["qa_chain_mr = RetrievalQA.from_chain_type(\n"," llm,\n"," retriever=vectordb.as_retriever(),\n"," chain_type=\"refine\"\n",")\n","question = \"Is probability a class topic?\"\n","result = qa_chain_mr({\"query\": question})\n","result[\"result\"]"]},{"cell_type":"code","execution_count":30,"metadata":{},"outputs":[{"data":{"text/plain":["'根据提供的更多上下文,这节课虽然主要内容是机器学习,但在讨论环节中可能会涉及统计学或代数等主题作为复习。概率论可能会在讨论环节中作为复习或扩展内容的一部分,因为课程假设学生对基本概率和统计学有一定了解。在讨论环节中,可能会涉及概率论的基本概念,如随机变量、期望、方差等。因此,概率论可能会在课程中作为复习或扩展内容的一部分。在讨论环节中,还可能会涉及基本的线性代数知识,如矩阵、向量、矩阵乘法、矩阵求逆等。因此,概率论和线性代数可能会在课程中作为复习或扩展内容的一部分。'"]},"execution_count":30,"metadata":{},"output_type":"execute_result"}],"source":["qa_chain_mr = RetrievalQA.from_chain_type(\n"," llm,\n"," retriever=vectordb.as_retriever(),\n"," chain_type=\"refine\"\n",")\n","question = \"概率论是其中一节的话题吗\"\n","result = qa_chain_mr({\"query\": question})\n","result[\"result\"]"]},{"cell_type":"markdown","metadata":{},"source":["你会注意到,这个结果比MapReduce链的结果要好。这是因为使用Refined Chain允许你逐个地组合信息,实际上比MapReduce链鼓励更多的信息传递。"]},{"cell_type":"markdown","metadata":{},"source":["## 六、实验:状态记录"]},{"cell_type":"markdown","metadata":{},"source":["让我们在这里做一个实验。\n","\n","我们将创建一个QA链,使用默认的stuff。让我们问一个问题,概率论是课程的主题吗?它会回答,概率论应该是先决条件。我们将追问,为什么需要这些先决条件?然后我们得到了一个答案。这门课的先决条件是假定具有计算机科学和基本计算机技能和原理的基本知识。这与之前问有关概率的问题毫不相关。"]},{"cell_type":"code","execution_count":31,"metadata":{},"outputs":[],"source":["qa_chain = RetrievalQA.from_chain_type(\n"," llm,\n"," retriever=vectordb.as_retriever()\n",")"]},{"cell_type":"code","execution_count":32,"metadata":{},"outputs":[{"data":{"text/plain":["'Yes, probability is a class topic mentioned in the context provided. 
The instructor assumes familiarity with basic probability and statistics for the class.'"]},"execution_count":32,"metadata":{},"output_type":"execute_result"}],"source":["question = \"Is probability a class topic?\"\n","result = qa_chain({\"query\": question})\n","result[\"result\"]"]},{"cell_type":"code","execution_count":33,"metadata":{},"outputs":[{"data":{"text/plain":["'The prerequisites mentioned in the context are needed because the course assumes familiarity with basic concepts in probability and statistics, as well as basic linear algebra. Understanding these foundational concepts is crucial for grasping the more advanced topics covered in the machine learning course. Additionally, knowledge of computer science principles and skills like big-O notation is necessary for implementing and understanding algorithms in machine learning.'"]},"execution_count":33,"metadata":{},"output_type":"execute_result"}],"source":["question = \"why are those prerequesites needed?\"\n","result = qa_chain({\"query\": question})\n","result[\"result\"]"]},{"cell_type":"code","execution_count":34,"metadata":{},"outputs":[{"data":{"text/plain":["'是的,概率论是这门课程的一个内容。老师在课程介绍中提到了对基本概率和统计知识的熟悉是必要的,因此会涉及概率论的内容。'"]},"execution_count":34,"metadata":{},"output_type":"execute_result"}],"source":["question = \"概率论是这节课的一个内容吗\"\n","result = qa_chain({\"query\": question})\n","result[\"result\"]"]},{"cell_type":"code","execution_count":35,"metadata":{},"outputs":[{"data":{"text/plain":["'这些知识对于理解和应用机器学习算法至关重要,因为机器学习在科学和工业的许多领域都有着巨大的影响。掌握这些知识可以帮助你解决感兴趣的问题,并为未来的研究和工作做好准备。'"]},"execution_count":35,"metadata":{},"output_type":"execute_result"}],"source":["question = \"为什么需要具备这些知识\"\n","result = qa_chain({\"query\": 
question})\n","result[\"result\"]"]},{"cell_type":"markdown","metadata":{},"source":["基本上,我们使用的链式(chain)没有任何状态的概念。它不记得之前的问题或之前的答案。为了实现这一点,我们需要引入内存,这是我们将在下一节中讨论的内容。"]}],"metadata":{"kernelspec":{"display_name":"gpt","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.9"},"orig_nbformat":4},"nbformat":4,"nbformat_minor":2}
+{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["# \u7b2c\u516d\u7ae0 \u95ee\u7b54\n", "\n", " - [\u4e00\u3001\u5f15\u8a00](#\u4e00\u3001\u5f15\u8a00)\n", " - [\u4e8c\u3001\u73af\u5883\u914d\u7f6e](#\u4e8c\u3001\u73af\u5883\u914d\u7f6e)\n", " - [\u4e09\u3001\u52a0\u8f7d\u5411\u91cf\u6570\u636e\u5e93](#\u4e09\u3001\u52a0\u8f7d\u5411\u91cf\u6570\u636e\u5e93)\n", " - [\u56db\u3001\u6784\u9020\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe](#\u56db\u3001\u6784\u9020\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe)\n", " - [\u4e94\u3001\u6df1\u5165\u63a2\u7a76\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe](#\u4e94\u3001\u6df1\u5165\u63a2\u7a76\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe)\n", " - [5.1 \u57fa\u4e8e\u6a21\u677f\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe](#5.1-\u57fa\u4e8e\u6a21\u677f\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe)\n", " - [5.2 \u57fa\u4e8e MapReduce \u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe](#5.2-\u57fa\u4e8e-MapReduce-\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe)\n", " - [5.3 \u57fa\u4e8e Refine \u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe](#5.3-\u57fa\u4e8e-Refine-\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe)\n", " - [\u516d\u3001\u5b9e\u9a8c\uff1a\u72b6\u6001\u8bb0\u5f55](#\u516d\u3001\u5b9e\u9a8c\uff1a\u72b6\u6001\u8bb0\u5f55)\n"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## \u4e00\u3001\u5f15\u8a00\n", "\n", "\n", "\u5728\u4e0a\u4e00\u7ae0\uff0c\u6211\u4eec\u5df2\u7ecf\u8ba8\u8bba\u4e86\u5982\u4f55\u68c0\u7d22\u4e0e\u7ed9\u5b9a\u95ee\u9898\u76f8\u5173\u7684\u6587\u6863\u3002\u4e0b\u4e00\u6b65\u662f\u83b7\u53d6\u8fd9\u4e9b\u6587\u6863\uff0c\u62ff\u5230\u539f\u59cb\u95ee\u9898\uff0c\u5c06\u5b83\u4eec\u4e00\u8d77\u4f20\u9012\u7ed9\u8bed\u8a00\u6a21\u578b\uff0c\u5e76\u8981\u6c42\u5b83\u56de\u7b54\u8fd9\u4e2a\u95ee\u9898\u3002\u5728\u672c\u8bfe\u7a0b\u4e2d\uff0c\u6211\u4eec\u5c06\u8be6\u7ec6\u4ecb\u7ecd\u8fd9\u4e00\u8fc7\u7a0b\uff0c\u4ee5\u53ca\u5b8c\u6210\u8fd9\u9879\u4efb\u52a1\u7684\u51e0\u79cd\u4e0d\u540c\u65b9\u6cd5\u3002"]}, {"cell_type": "markdown", 
"metadata": {}, "source": ["\u6211\u4eec\u5df2\u7ecf\u5b8c\u6210\u4e86\u6574\u4e2a\u5b58\u50a8\u548c\u83b7\u53d6\uff0c\u83b7\u53d6\u4e86\u76f8\u5173\u7684\u5207\u5206\u6587\u6863\u4e4b\u540e\uff0c\u73b0\u5728\u6211\u4eec\u9700\u8981\u5c06\u5b83\u4eec\u4f20\u9012\u7ed9\u8bed\u8a00\u6a21\u578b\uff0c\u4ee5\u83b7\u5f97\u7b54\u6848\u3002\u8fd9\u4e2a\u8fc7\u7a0b\u7684\u4e00\u822c\u6d41\u7a0b\u5982\u4e0b\uff1a\u9996\u5148\u95ee\u9898\u88ab\u63d0\u51fa\uff0c\u7136\u540e\u6211\u4eec\u67e5\u627e\u76f8\u5173\u7684\u6587\u6863\uff0c\u63a5\u7740\u5c06\u8fd9\u4e9b\u5207\u5206\u6587\u6863\u548c\u7cfb\u7edf\u63d0\u793a\u4e00\u8d77\u4f20\u9012\u7ed9\u8bed\u8a00\u6a21\u578b\uff0c\u5e76\u83b7\u5f97\u7b54\u6848\u3002\n", "\n", "\u9ed8\u8ba4\u60c5\u51b5\u4e0b\uff0c\u6211\u4eec\u5c06\u6240\u6709\u7684\u6587\u6863\u5207\u7247\u90fd\u4f20\u9012\u5230\u540c\u4e00\u4e2a\u4e0a\u4e0b\u6587\u7a97\u53e3\u4e2d\uff0c\u5373\u540c\u4e00\u6b21\u8bed\u8a00\u6a21\u578b\u8c03\u7528\u4e2d\u3002\u7136\u800c\uff0c\u6709\u4e00\u4e9b\u4e0d\u540c\u7684\u65b9\u6cd5\u53ef\u4ee5\u89e3\u51b3\u8fd9\u4e2a\u95ee\u9898\uff0c\u5b83\u4eec\u90fd\u6709\u4f18\u7f3a\u70b9\u3002\u5927\u90e8\u5206\u4f18\u70b9\u6765\u81ea\u4e8e\u6709\u65f6\u53ef\u80fd\u4f1a\u6709\u5f88\u591a\u6587\u6863\uff0c\u4f46\u4f60\u7b80\u5355\u5730\u65e0\u6cd5\u5c06\u5b83\u4eec\u5168\u90e8\u4f20\u9012\u5230\u540c\u4e00\u4e2a\u4e0a\u4e0b\u6587\u7a97\u53e3\u4e2d\u3002MapReduce\u3001Refine \u548c MapRerank \u662f\u4e09\u79cd\u65b9\u6cd5\uff0c\u7528\u4e8e\u89e3\u51b3\u8fd9\u4e2a\u77ed\u4e0a\u4e0b\u6587\u7a97\u53e3\u7684\u95ee\u9898\u3002\u6211\u4eec\u5c06\u5728\u8be5\u8bfe\u7a0b\u4e2d\u8fdb\u884c\u7b80\u8981\u4ecb\u7ecd\u3002"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## \u4e8c\u3001\u73af\u5883\u914d\u7f6e"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u914d\u7f6e\u73af\u5883\u65b9\u6cd5\u540c\u524d\uff0c\u6b64\u5904\u4e0d\u518d\u8d58\u8ff0"]}, {"cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": 
["import os\n", "import openai\n", "import sys\n", "sys.path.append('../..')\n", "\n", "from dotenv import load_dotenv, find_dotenv\n", "\n", "_ = load_dotenv(find_dotenv()) # read local .env file\n", "\n", "openai.api_key = os.environ['OPENAI_API_KEY']\n"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u57282023\u5e749\u67082\u65e5\u4e4b\u540e\uff0cGPT-3.5 API \u4f1a\u8fdb\u884c\u66f4\u65b0\uff0c\u56e0\u6b64\u6b64\u5904\u9700\u8981\u8fdb\u884c\u4e00\u4e2a\u65f6\u95f4\u5224\u65ad"]}, {"cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["gpt-3.5-turbo-0301\n"]}], "source": ["import datetime\n", "current_date = datetime.datetime.now().date()\n", "if current_date < datetime.date(2023, 9, 2):\n", " llm_name = \"gpt-3.5-turbo-0301\"\n", "else:\n", " llm_name = \"gpt-3.5-turbo\"\n", "print(llm_name)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## \u4e09\u3001\u52a0\u8f7d\u5411\u91cf\u6570\u636e\u5e93"]}, {"cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": ["# \u52a0\u8f7d\u5728\u4e4b\u524d\u5df2\u7ecf\u8fdb\u884c\u6301\u4e45\u5316\u7684\u5411\u91cf\u6570\u636e\u5e93\n", "from langchain.vectorstores import Chroma\n", "from langchain.embeddings.openai import OpenAIEmbeddings\n", "persist_directory = 'docs/chroma/cs229_lectures/'\n", "embedding = OpenAIEmbeddings()\n", "vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)"]}, {"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["209\n"]}], "source": ["# \u53ef\u4ee5\u770b\u89c1\u5305\u542b\u4e86\u6211\u4eec\u4e4b\u524d\u8fdb\u884c\u5206\u5272\u7684209\u4e2a\u6587\u6863\n", "print(vectordb._collection.count())"]}, {"cell_type": "markdown", "metadata": {}, "source": 
["\u6211\u4eec\u53ef\u4ee5\u6d4b\u8bd5\u4e00\u4e0b\u5bf9\u4e8e\u4e00\u4e2a\u63d0\u95ee\u8fdb\u884c\u5411\u91cf\u68c0\u7d22\u3002\u5982\u4e0b\u4ee3\u7801\u4f1a\u5728\u5411\u91cf\u6570\u636e\u5e93\u4e2d\u6839\u636e\u76f8\u4f3c\u6027\u8fdb\u884c\u68c0\u7d22\uff0c\u8fd4\u56de\u7ed9\u4f60 k \u4e2a\u6587\u6863\u3002"]}, {"cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [{"data": {"text/plain": ["3"]}, "execution_count": 34, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"What are major topics for this class?\"\n", "docs = vectordb.similarity_search(question,k=3)\n", "len(docs)"]}, {"cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [{"data": {"text/plain": ["3"]}, "execution_count": 35, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"\u8fd9\u8282\u8bfe\u7684\u4e3b\u8981\u8bdd\u9898\u662f\u4ec0\u4e48\"\n", "docs = vectordb.similarity_search(question,k=3)\n", "len(docs)"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## \u56db\u3001\u6784\u9020\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u57fa\u4e8e LangChain\uff0c\u6211\u4eec\u53ef\u4ee5\u6784\u9020\u4e00\u4e2a\u4f7f\u7528 GPT3.5 \u8fdb\u884c\u95ee\u7b54\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe\uff0c\u8fd9\u662f\u4e00\u79cd\u901a\u8fc7\u68c0\u7d22\u6b65\u9aa4\u8fdb\u884c\u95ee\u7b54\u7684\u65b9\u6cd5\u3002\u6211\u4eec\u53ef\u4ee5\u901a\u8fc7\u4f20\u5165\u4e00\u4e2a\u8bed\u8a00\u6a21\u578b\u548c\u4e00\u4e2a\u5411\u91cf\u6570\u636e\u5e93\u6765\u521b\u5efa\u5b83\u4f5c\u4e3a\u68c0\u7d22\u5668\u3002\u7136\u540e\uff0c\u6211\u4eec\u53ef\u4ee5\u7528\u95ee\u9898\u4f5c\u4e3a\u67e5\u8be2\u8c03\u7528\u5b83\uff0c\u5f97\u5230\u4e00\u4e2a\u7b54\u6848\u3002"]}, {"cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": ["# \u4f7f\u7528 ChatGPT3.5\uff0c\u6e29\u5ea6\u8bbe\u7f6e\u4e3a0\n", "from langchain.chat_models import ChatOpenAI\n", "llm = 
ChatOpenAI(model_name=llm_name, temperature=0)"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": ["# \u5bfc\u5165\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe\n", "from langchain.chains import RetrievalQA"]}, {"cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": ["# \u58f0\u660e\u4e00\u4e2a\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe\n", "qa_chain = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever()\n", ")"]}, {"cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": ["# \u53ef\u4ee5\u4ee5\u8be5\u65b9\u5f0f\u8fdb\u884c\u68c0\u7d22\u95ee\u7b54\n", "question = \"What are major topics for this class?\"\n", "result = qa_chain({\"query\": question})"]}, {"cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [{"data": {"text/plain": ["'The major topic for this class is machine learning. Additionally, there may be some discussion on statistics and algebra as a refresher, and later in the quarter, there may be some discussion on extensions for the material covered in the main lectures.'"]}, "execution_count": 13, "metadata": {}, "output_type": "execute_result"}], "source": ["result[\"result\"]"]}, {"cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": ["# \u53ef\u4ee5\u4ee5\u8be5\u65b9\u5f0f\u8fdb\u884c\u68c0\u7d22\u95ee\u7b54\n", "question = \"\u8fd9\u8282\u8bfe\u7684\u4e3b\u8981\u8bdd\u9898\u662f\u4ec0\u4e48\"\n", "result = qa_chain({\"query\": question})"]}, {"cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["\u4ece\u8fd9\u4e9b\u4e0a\u4e0b\u6587\u6765\u770b\uff0c\u8fd9\u8282\u8bfe\u7684\u4e3b\u8981\u8bdd\u9898\u5305\u62ec\u8bfe\u7a0b\u4fe1\u606f\u3001\u5728\u7ebf\u8d44\u6e90\u548c\u7ebf\u6027\u4ee3\u6570\u3002\n"]}], "source": ["print(result[\"result\"])"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## 
\u4e94\u3001\u6df1\u5165\u63a2\u7a76\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u901a\u8fc7\u4e0a\u8ff0\u4ee3\u7801\uff0c\u6211\u4eec\u53ef\u4ee5\u5b9e\u73b0\u4e00\u4e2a\u7b80\u5355\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe\u3002\u63a5\u4e0b\u6765\uff0c\u8ba9\u6211\u4eec\u6df1\u5165\u5176\u4e2d\u7684\u7ec6\u8282\uff0c\u770b\u770b\u5728\u8fd9\u4e2a\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe\u4e2d\uff0cLangChain \u90fd\u505a\u4e86\u4e9b\u4ec0\u4e48\u3002\n"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### 5.1 \u57fa\u4e8e\u6a21\u677f\u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\n", "\u6211\u4eec\u9996\u5148\u5b9a\u4e49\u4e86\u4e00\u4e2a\u63d0\u793a\u6a21\u677f\u3002\u5b83\u5305\u542b\u4e00\u4e9b\u5173\u4e8e\u5982\u4f55\u4f7f\u7528\u4e0b\u9762\u7684\u4e0a\u4e0b\u6587\u7247\u6bb5\u7684\u8bf4\u660e\uff0c\u7136\u540e\u6709\u4e00\u4e2a\u4e0a\u4e0b\u6587\u53d8\u91cf\u7684\u5360\u4f4d\u7b26\u3002"]}, {"cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": ["from langchain.prompts import PromptTemplate\n", "\n", "# Build prompt\n", "template = \"\"\"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say \"thanks for asking!\" at the end of the answer. 
\n", "{context}\n", "Question: {question}\n", "Helpful Answer:\"\"\"\n", "QA_CHAIN_PROMPT = PromptTemplate.from_template(template)\n"]}, {"cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": ["# \u4e2d\u6587\u7248\n", "from langchain.prompts import PromptTemplate\n", "\n", "# Build prompt\n", "template = \"\"\"\u4f7f\u7528\u4ee5\u4e0b\u4e0a\u4e0b\u6587\u7247\u6bb5\u6765\u56de\u7b54\u6700\u540e\u7684\u95ee\u9898\u3002\u5982\u679c\u4f60\u4e0d\u77e5\u9053\u7b54\u6848\uff0c\u53ea\u9700\u8bf4\u4e0d\u77e5\u9053\uff0c\u4e0d\u8981\u8bd5\u56fe\u7f16\u9020\u7b54\u6848\u3002\u7b54\u6848\u6700\u591a\u4f7f\u7528\u4e09\u4e2a\u53e5\u5b50\u3002\u5c3d\u91cf\u7b80\u660e\u627c\u8981\u5730\u56de\u7b54\u3002\u5728\u56de\u7b54\u7684\u6700\u540e\u4e00\u5b9a\u8981\u8bf4\"\u611f\u8c22\u60a8\u7684\u63d0\u95ee\uff01\"\n", "{context}\n", "\u95ee\u9898\uff1a{question}\n", "\u6709\u7528\u7684\u56de\u7b54\uff1a\"\"\"\n", "QA_CHAIN_PROMPT = PromptTemplate.from_template(template)\n"]}, {"cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": ["# Run chain\n", "qa_chain = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever(),\n", " return_source_documents=True,\n", " chain_type_kwargs={\"prompt\": QA_CHAIN_PROMPT}\n", ")"]}, {"cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": ["question = \"Is probability a class topic?\""]}, {"cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": ["result = qa_chain({\"query\": question})"]}, {"cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [{"data": {"text/plain": ["'Yes, probability is a class topic and the instructor assumes familiarity with basic probability and statistics.'"]}, "execution_count": 40, "metadata": {}, "output_type": "execute_result"}], "source": ["result[\"result\"]"]}, {"cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": ["# 
\u4e2d\u6587\u7248\n", "question = \"\u673a\u5668\u5b66\u4e60\u662f\u5176\u4e2d\u4e00\u8282\u7684\u8bdd\u9898\u5417\""]}, {"cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": ["result = qa_chain({\"query\": question})"]}, {"cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u662f\u7684\uff0c\u673a\u5668\u5b66\u4e60\u662f\u5176\u4e2d\u4e00\u8282\u7684\u8bdd\u9898\u3002\u611f\u8c22\u60a8\u7684\u63d0\u95ee\uff01'"]}, "execution_count": 49, "metadata": {}, "output_type": "execute_result"}], "source": ["result[\"result\"]"]}, {"cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [{"data": {"text/plain": ["Document(page_content=\"So in this class, we've tried to convey to you a broad set of principl es and tools that will \\nbe useful for doing many, many things. And ev ery time I teach this class, I can actually \\nvery confidently say that af ter December, no matter what yo u're going to do after this \\nDecember when you've sort of completed this class, you'll find the things you learn in \\nthis class very useful, and these things will be useful pretty much no matter what you end \\nup doing later in your life. \\nSo I have more logistics to go over later, but let's say a few more words about machine \\nlearning. I feel that machine learning grew out of early work in AI, early work in artificial \\nintelligence. And over the last \u2014 I wanna say last 15 or last 20 years or so, it's been viewed as a sort of growing new capability for computers. And in particular, it turns out \\nthat there are many programs or there are many applications that you can't program by \\nhand. 
\\nFor example, if you want to get a computer to read handwritten characters, to read sort of \\nhandwritten digits, that actual ly turns out to be amazingly difficult to write a piece of \\nsoftware to take this input, an image of some thing that I wrote and to figure out just what \\nit is, to translate my cursive handwriting into \u2014 to extract the characters I wrote out in \\nlonghand. And other things: One thing that my students and I do is autonomous flight. It \\nturns out to be extremely difficult to sit dow n and write a program to fly a helicopter.\", metadata={'source': 'docs/cs229_lectures/MachineLearning-Lecture01.pdf', 'page': 2})"]}, "execution_count": 50, "metadata": {}, "output_type": "execute_result"}], "source": ["result[\"source_documents\"][0]"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u8fd9\u79cd\u65b9\u6cd5\u975e\u5e38\u597d\uff0c\u56e0\u4e3a\u5b83\u53ea\u6d89\u53ca\u5bf9\u8bed\u8a00\u6a21\u578b\u7684\u4e00\u6b21\u8c03\u7528\u3002\u7136\u800c\uff0c\u5b83\u4e5f\u6709\u5c40\u9650\u6027\uff0c\u5373\u5982\u679c\u6587\u6863\u592a\u591a\uff0c\u53ef\u80fd\u65e0\u6cd5\u5c06\u5b83\u4eec\u5168\u90e8\u9002\u914d\u5230\u4e0a\u4e0b\u6587\u7a97\u53e3\u4e2d\u3002\u6211\u4eec\u53ef\u4ee5\u4f7f\u7528\u53e6\u4e00\u79cd\u6280\u672f\u6765\u5bf9\u6587\u6863\u8fdb\u884c\u95ee\u7b54\uff0c\u5373MapReduce\u6280\u672f\u3002"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### 5.2 \u57fa\u4e8e MapReduce \u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u5728 MapReduce 
\u6280\u672f\u4e2d\uff0c\u9996\u5148\u5c06\u6bcf\u4e2a\u72ec\u7acb\u7684\u6587\u6863\u5355\u72ec\u53d1\u9001\u5230\u8bed\u8a00\u6a21\u578b\u4ee5\u83b7\u53d6\u539f\u59cb\u7b54\u6848\u3002\u7136\u540e\uff0c\u8fd9\u4e9b\u7b54\u6848\u901a\u8fc7\u6700\u7ec8\u5bf9\u8bed\u8a00\u6a21\u578b\u7684\u4e00\u6b21\u8c03\u7528\u7ec4\u5408\u6210\u6700\u7ec8\u7684\u7b54\u6848\u3002\u867d\u7136\u8fd9\u6837\u6d89\u53ca\u4e86\u66f4\u591a\u5bf9\u8bed\u8a00\u6a21\u578b\u7684\u8c03\u7528\uff0c\u4f46\u5b83\u7684\u4f18\u52bf\u5728\u4e8e\u53ef\u4ee5\u5904\u7406\u4efb\u610f\u6570\u91cf\u7684\u6587\u6863\u3002\n"]}, {"cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": ["qa_chain_mr = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever(),\n", " chain_type=\"map_reduce\"\n", ")"]}, {"cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": ["question = \"Is probability a class topic?\"\n", "result = qa_chain_mr({\"query\": question})"]}, {"cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [{"data": {"text/plain": ["'It is not clear from the given portion of the document whether probability is a class topic or not. 
The text only mentions that familiarity with basic probability and statistics is assumed as a prerequisite for the class.'"]}, "execution_count": 53, "metadata": {}, "output_type": "execute_result"}], "source": ["result[\"result\"]"]}, {"cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u6839\u636e\u7ed9\u51fa\u7684\u6587\u4ef6\u90e8\u5206\uff0c\u6ca1\u6709\u63d0\u5230\u6982\u7387\u8bba\u3002'"]}, "execution_count": 55, "metadata": {}, "output_type": "execute_result"}], "source": ["qa_chain_mr = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever(),\n", " chain_type=\"map_reduce\"\n", ")\n", "# \u4e2d\u6587\u7248\n", "question = \"\u6982\u7387\u8bba\u662f\u5176\u4e2d\u4e00\u8282\u7684\u8bdd\u9898\u5417\"\n", "result = qa_chain_mr({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u5f53\u6211\u4eec\u5c06\u4e4b\u524d\u7684\u95ee\u9898\u901a\u8fc7\u8fd9\u4e2a\u94fe\u8fdb\u884c\u8fd0\u884c\u65f6\uff0c\u6211\u4eec\u53ef\u4ee5\u770b\u5230\u8fd9\u79cd\u65b9\u6cd5\u7684\u4e24\u4e2a\u95ee\u9898\u3002\u7b2c\u4e00\uff0c\u901f\u5ea6\u8981\u6162\u5f97\u591a\u3002\u7b2c\u4e8c\uff0c\u7ed3\u679c\u5b9e\u9645\u4e0a\u66f4\u5dee\u3002\u6839\u636e\u7ed9\u5b9a\u6587\u6863\u7684\u8fd9\u4e00\u90e8\u5206\uff0c\u5bf9\u8fd9\u4e2a\u95ee\u9898\u5e76\u6ca1\u6709\u660e\u786e\u7684\u7b54\u6848\u3002\u8fd9\u53ef\u80fd\u662f\u56e0\u4e3a\u5b83\u662f\u57fa\u4e8e\u6bcf\u4e2a\u6587\u6863\u5355\u72ec\u56de\u7b54\u7684\u3002\u56e0\u6b64\uff0c\u5982\u679c\u4fe1\u606f\u5206\u5e03\u5728\u4e24\u4e2a\u6587\u6863\u4e4b\u95f4\uff0c\u5b83\u5e76\u6ca1\u6709\u5728\u540c\u4e00\u4e0a\u4e0b\u6587\u4e2d\u83b7\u53d6\u5230\u6240\u6709\u7684\u4fe1\u606f\u3002"]}, {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["#import os\n", "#os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n", "#os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.langchain.plus\"\n", 
"#os.environ[\"LANGCHAIN_API_KEY\"] = \"...\" # replace dots with your api key"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u6211\u4eec\u53ef\u5bfc\u5165\u4e0a\u8ff0\u73af\u5883\u53d8\u91cf\uff0c\u7136\u540e\u63a2\u5bfb MapReduce \u6587\u6863\u94fe\u7684\u7ec6\u8282\u3002\u4f8b\u5982\uff0c\u4e0a\u8ff0\u6f14\u793a\u4e2d\uff0c\u6211\u4eec\u5b9e\u9645\u4e0a\u6d89\u53ca\u4e86\u56db\u4e2a\u5355\u72ec\u7684\u5bf9\u8bed\u8a00\u6a21\u578b\u7684\u8c03\u7528\u3002\u5728\u8fd0\u884c\u5b8c\u6bcf\u4e2a\u6587\u6863\u540e\uff0c\u5b83\u4eec\u4f1a\u5728\u6700\u7ec8\u94fe\u5f0f\u4e2d\u7ec4\u5408\u5728\u4e00\u8d77\uff0c\u5373Stuffed Documents\u94fe\uff0c\u5c06\u6240\u6709\u8fd9\u4e9b\u56de\u7b54\u5408\u5e76\u5230\u6700\u7ec8\u7684\u8c03\u7528\u4e2d\u3002"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### 5.3 \u57fa\u4e8e Refine \u7684\u68c0\u7d22\u5f0f\u95ee\u7b54\u94fe"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u6211\u4eec\u53ef\u4ee5\u7c7b\u4f3c\u5730\u8bbe\u7f6e\u94fe\u5f0f\u7c7b\u578b\u4e3aRefine\u3002\u8fd9\u662f\u4e00\u79cd\u65b0\u7684\u94fe\u5f0f\u7c7b\u578b\u3002Refine \u6587\u6863\u94fe\u7c7b\u4f3c\u4e8e MapReduce \u94fe\uff0c\u5bf9\u4e8e\u6bcf\u4e00\u4e2a\u6587\u6863\uff0c\u4f1a\u8c03\u7528\u4e00\u6b21 LLM\uff0c\u4f46\u6709\u6240\u6539\u8fdb\u7684\u662f\uff0c\u6211\u4eec\u6bcf\u6b21\u53d1\u9001\u7ed9 LLM \u7684\u6700\u7ec8\u63d0\u793a\u662f\u4e00\u4e2a\u5e8f\u5217\uff0c\u8fd9\u4e2a\u5e8f\u5217\u4f1a\u5c06\u5148\u524d\u7684\u54cd\u5e94\u4e0e\u65b0\u6570\u636e\u7ed3\u5408\u5728\u4e00\u8d77\uff0c\u5e76\u8bf7\u6c42\u5f97\u5230\u6539\u8fdb\u540e\u7684\u54cd\u5e94\u3002\u56e0\u6b64\uff0c\u8fd9\u662f\u4e00\u79cd\u7c7b\u4f3c\u4e8e RNN \u7684\u6982\u5ff5\uff0c\u6211\u4eec\u589e\u5f3a\u4e86\u4e0a\u4e0b\u6587\uff0c\u4ece\u800c\u89e3\u51b3\u4fe1\u606f\u5206\u5e03\u5728\u4e0d\u540c\u6587\u6863\u7684\u95ee\u9898\u3002"]}, {"cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [{"data": {"text/plain": ["'Based on the additional 
context provided, probability is assumed to be a prerequisite and not a main topic of the class. The instructor assumes that students are familiar with basic probability and statistics, including random variables, expectation, variance, and basic linear algebra. The class will not be very programming-intensive, but some programming will be done in MATLAB or Octave. The instructor will provide a refresher course on the prerequisites in some of the discussion sections. The class also assumes familiarity with basic linear algebra, including matrices, vectors, matrix multiplication, and matrix inverse. Most undergraduate linear algebra courses, such as Math 51, 103, Math 113, or CS205 at Stanford, are more than enough. The instructor will also review the prerequisites in some of the discussion sections.'"]}, "execution_count": 56, "metadata": {}, "output_type": "execute_result"}], "source": ["qa_chain_mr = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever(),\n", " chain_type=\"refine\"\n", ")\n", "question = \"Is probability a class topic?\"\n", "result = qa_chain_mr({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [{"data": {"text/plain": ["'Based on the additional context provided, the instructor mentions that they will cover statistics and algebra in the discussion sections as a refresher, and will also use the discussion sections to go over extensions of the material taught in the main lectures. However, there is no explicit mention of probability theory being covered in the course. 
Therefore, the original answer still stands.'"]}, "execution_count": 57, "metadata": {}, "output_type": "execute_result"}], "source": ["qa_chain_mr = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever(),\n", " chain_type=\"refine\"\n", ")\n", "question = \"\u6982\u7387\u8bba\u662f\u5176\u4e2d\u4e00\u8282\u7684\u8bdd\u9898\u5417\"\n", "result = qa_chain_mr({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u4f60\u4f1a\u6ce8\u610f\u5230\uff0c\u8fd9\u4e2a\u7ed3\u679c\u6bd4MapReduce\u94fe\u7684\u7ed3\u679c\u8981\u597d\u3002\u8fd9\u662f\u56e0\u4e3a\u4f7f\u7528Refine\u94fe\u5141\u8bb8\u4f60\u9010\u4e2a\u5730\u7ec4\u5408\u4fe1\u606f\uff0c\u5b9e\u9645\u4e0a\u6bd4MapReduce\u94fe\u9f13\u52b1\u66f4\u591a\u7684\u4fe1\u606f\u4f20\u9012\u3002"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## \u516d\u3001\u5b9e\u9a8c\uff1a\u72b6\u6001\u8bb0\u5f55"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u8ba9\u6211\u4eec\u5728\u8fd9\u91cc\u505a\u4e00\u4e2a\u5b9e\u9a8c\u3002\n", "\n", "\u6211\u4eec\u5c06\u521b\u5efa\u4e00\u4e2aQA\u94fe\uff0c\u4f7f\u7528\u9ed8\u8ba4\u7684stuff\u3002\u8ba9\u6211\u4eec\u95ee\u4e00\u4e2a\u95ee\u9898\uff0c\u6982\u7387\u8bba\u662f\u8bfe\u7a0b\u7684\u4e3b\u9898\u5417\uff1f\u5b83\u4f1a\u56de\u7b54\uff0c\u6982\u7387\u8bba\u5e94\u8be5\u662f\u5148\u51b3\u6761\u4ef6\u3002\u6211\u4eec\u5c06\u8ffd\u95ee\uff0c\u4e3a\u4ec0\u4e48\u9700\u8981\u8fd9\u4e9b\u5148\u51b3\u6761\u4ef6\uff1f\u7136\u540e\u6211\u4eec\u5f97\u5230\u4e86\u4e00\u4e2a\u7b54\u6848\u3002\u8fd9\u95e8\u8bfe\u7684\u5148\u51b3\u6761\u4ef6\u662f\u5047\u5b9a\u5177\u6709\u8ba1\u7b97\u673a\u79d1\u5b66\u548c\u57fa\u672c\u8ba1\u7b97\u673a\u6280\u80fd\u548c\u539f\u7406\u7684\u57fa\u672c\u77e5\u8bc6\u3002\u8fd9\u4e0e\u4e4b\u524d\u95ee\u6709\u5173\u6982\u7387\u7684\u95ee\u9898\u6beb\u4e0d\u76f8\u5173\u3002"]}, {"cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": ["qa_chain = 
RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=vectordb.as_retriever()\n", ")"]}, {"cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [{"data": {"text/plain": ["'Yes, probability is a topic in this class. The speaker assumes that students have familiarity with basic probability and statistics, and mentions that most undergraduate statistics classes will be more than enough preparation for this class.'"]}, "execution_count": 59, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"Is probability a class topic?\"\n", "result = qa_chain({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [{"data": {"text/plain": ["'The prerequisites are needed because in this class, the instructor assumes that all students have a basic knowledge of computer science and knowledge of basic computer skills and principles. This includes understanding of big-O notation and other fundamental concepts. 
Without this basic knowledge, it may be difficult to understand the material covered in the class.'"]}, "execution_count": 60, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"why are those prerequesites needed?\"\n", "result = qa_chain({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u662f\u7684\uff0c\u4f5c\u8005\u5728\u6587\u4e2d\u63d0\u5230\u4e86\u8fd9\u95e8\u8bfe\u7a0b\u9700\u8981\u5b66\u751f\u5177\u5907\u57fa\u672c\u7684\u6982\u7387\u8bba\u548c\u7edf\u8ba1\u5b66\u77e5\u8bc6\u3002'"]}, "execution_count": 62, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"\u6982\u7387\u8bba\u662f\u8fd9\u8282\u8bfe\u7684\u4e00\u4e2a\u5185\u5bb9\u5417\"\n", "result = qa_chain({\"query\": question})\n", "result[\"result\"]"]}, {"cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [{"data": {"text/plain": ["'\u5728\u8fd9\u6bb5\u4e0a\u4e0b\u6587\u4e2d\uff0c\u4f5c\u8005\u63d0\u5230\u8fd9\u4e9b\u77e5\u8bc6\u662f\u8fd9\u95e8\u8bfe\u7a0b\u7684\u5148\u51b3\u6761\u4ef6\uff0c\u56e0\u4e3a\u8fd9\u95e8\u8bfe\u7a0b\u6d89\u53ca\u5230\u673a\u5668\u5b66\u4e60\u7684\u57fa\u672c\u6982\u5ff5\u548c\u7b97\u6cd5\uff0c\u9700\u8981\u5b66\u751f\u5177\u5907\u8ba1\u7b97\u673a\u79d1\u5b66\u548c\u57fa\u672c\u8ba1\u7b97\u673a\u6280\u80fd\u548c\u539f\u7406\u7684\u57fa\u672c\u77e5\u8bc6\u3002\u5982\u679c\u5b66\u751f\u6ca1\u6709\u8fd9\u4e9b\u57fa\u7840\u77e5\u8bc6\uff0c\u53ef\u80fd\u4f1a\u5f88\u96be\u7406\u89e3\u548c\u5e94\u7528\u673a\u5668\u5b66\u4e60\u7b97\u6cd5\u3002\u56e0\u6b64\uff0c\u5b66\u751f\u9700\u8981\u5177\u5907\u8fd9\u4e9b\u77e5\u8bc6\u624d\u80fd\u66f4\u597d\u5730\u5b66\u4e60\u548c\u5e94\u7528\u673a\u5668\u5b66\u4e60\u3002'"]}, "execution_count": 63, "metadata": {}, "output_type": "execute_result"}], "source": ["question = \"\u4e3a\u4ec0\u4e48\u9700\u8981\u5177\u5907\u8fd9\u4e9b\u77e5\u8bc6\"\n", "result = qa_chain({\"query\": 
question})\n", "result[\"result\"]"]}, {"cell_type": "markdown", "metadata": {}, "source": ["\u57fa\u672c\u4e0a\uff0c\u6211\u4eec\u4f7f\u7528\u7684\u94fe\u5f0f\uff08chain\uff09\u6ca1\u6709\u4efb\u4f55\u72b6\u6001\u7684\u6982\u5ff5\u3002\u5b83\u4e0d\u8bb0\u5f97\u4e4b\u524d\u7684\u95ee\u9898\u6216\u4e4b\u524d\u7684\u7b54\u6848\u3002\u4e3a\u4e86\u5b9e\u73b0\u8fd9\u4e00\u70b9\uff0c\u6211\u4eec\u9700\u8981\u5f15\u5165\u8bb0\u5fc6\uff08Memory\uff09\uff0c\u8fd9\u662f\u6211\u4eec\u5c06\u5728\u4e0b\u4e00\u8282\u4e2d\u8ba8\u8bba\u7684\u5185\u5bb9\u3002"]}], "metadata": {"kernelspec": {"display_name": "gpt", "language": "python", "name": "python3"}, "language_info": {"codemirror_mode": {"name": "ipython", "version": 3}, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11"}, "orig_nbformat": 4}, "nbformat": 4, "nbformat_minor": 2}
\ No newline at end of file
diff --git "a/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/7.\350\201\212\345\244\251 Chat.ipynb" "b/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/7.\350\201\212\345\244\251 Chat.ipynb"
index ec10d810..82d42aac 100644
--- "a/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/7.\350\201\212\345\244\251 Chat.ipynb"
+++ "b/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/7.\350\201\212\345\244\251 Chat.ipynb"
@@ -1 +1 @@
-{"cells":[{"cell_type":"markdown","id":"956ff3a4","metadata":{},"source":["# 第七章、聊天 Chat\n","\n"," - [一、环境配置](#一、环境配置)\n"," - [二、记忆(Memory)](#二、记忆(Memory))\n"," - [三、对话检索链(ConversationalRetrievalChain)](#三、对话检索链(ConversationalRetrievalChain))\n"," - [四、创建一个适用于您文档的聊天机器人](#四、创建一个适用于您文档的聊天机器人)\n"," - [五、创建聊天机器人](#五、创建聊天机器人)\n"," - [六、致谢](#六、致谢)\n"]},{"cell_type":"markdown","id":"6f2e2cd9","metadata":{},"source":["回想一下检索增强生成 (retrieval augmented generation,RAG) 的整体工作流程:"]},{"attachments":{"overview.jpeg":{"image/jpeg":""}},"cell_type":"markdown","id":"231abd66","metadata":{},"source":[""]},{"cell_type":"markdown","id":"5e6a4d0c","metadata":{},"source":["我们已经接近完成一个功能性的聊天机器人了。我们讨论了`文档加载`、`切分`、`存储`和`检索`。我们展示了如何使用`检索 QA`链在 Q+A 中使用`检索`生成输出。\n","\n","我们的机器人已经可以回答问题了,但还无法处理后续问题,无法进行真正的对话。好消息是,在本章中,我们将解决这个问题。\n","\n","我们现在将创建一个问答聊天机器人。它与之前非常相似,但我们将添加聊天历史的功能。这是您之前进行的任何对话或消息。这将使机器人在尝试回答问题时能够考虑到聊天历史的上下文。所以,如果您继续提问,它会知道您想谈论什么。"]},{"cell_type":"markdown","id":"1f698037","metadata":{},"source":["## 一、环境配置"]},{"cell_type":"code","execution_count":1,"id":"926fc482-dd03-40e1-8083-171aad3e2b26","metadata":{"height":232,"tags":[]},"outputs":[{"data":{"application/javascript":"(function(root) {\n function now() {\n return new Date();\n }\n\n var force = true;\n var py_version = '3.4.1'.replace('rc', '-rc.').replace('.dev', '-dev.');\n var reloading = false;\n var Bokeh = root.Bokeh;\n\n if (typeof (root._bokeh_timeout) === \"undefined\" || force) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks;\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, js_modules, js_exports, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n if (js_modules == null) js_modules = [];\n if 
(js_exports == null) js_exports = {};\n\n root._bokeh_onload_callbacks.push(callback);\n\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls.length === 0 && js_modules.length === 0 && Object.keys(js_exports).length === 0) {\n run_callbacks();\n return null;\n }\n if (!reloading) {\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n }\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n window._bokeh_on_load = on_load\n\n function on_error() {\n console.error(\"failed to load \" + url);\n }\n\n var skip = [];\n if (window.requirejs) {\n window.requirejs.config({'packages': {}, 'paths': {}, 'shim': {}});\n root._bokeh_is_loading = css_urls.length + 0;\n } else {\n root._bokeh_is_loading = css_urls.length + js_urls.length + js_modules.length + Object.keys(js_exports).length;\n }\n\n var existing_stylesheets = []\n var links = document.getElementsByTagName('link')\n for (var i = 0; i < links.length; i++) {\n var link = links[i]\n if (link.href != null) {\n\texisting_stylesheets.push(link.href)\n }\n }\n for (var i = 0; i < css_urls.length; i++) {\n var url = css_urls[i];\n if (existing_stylesheets.indexOf(url) !== -1) {\n\ton_load()\n\tcontinue;\n }\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error;\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n } var existing_scripts = []\n var scripts = document.getElementsByTagName('script')\n for (var i = 0; i < scripts.length; i++) {\n var script = scripts[i]\n if (script.src != null) {\n\texisting_scripts.push(script.src)\n }\n }\n for (var i = 0; i 
< js_urls.length; i++) {\n var url = js_urls[i];\n if (skip.indexOf(url) !== -1 || existing_scripts.indexOf(url) !== -1) {\n\tif (!window.requirejs) {\n\t on_load();\n\t}\n\tcontinue;\n }\n var element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error;\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n for (var i = 0; i < js_modules.length; i++) {\n var url = js_modules[i];\n if (skip.indexOf(url) !== -1 || existing_scripts.indexOf(url) !== -1) {\n\tif (!window.requirejs) {\n\t on_load();\n\t}\n\tcontinue;\n }\n var element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error;\n element.async = false;\n element.src = url;\n element.type = \"module\";\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n for (const name in js_exports) {\n var url = js_exports[name];\n if (skip.indexOf(url) >= 0 || root[name] != null) {\n\tif (!window.requirejs) {\n\t on_load();\n\t}\n\tcontinue;\n }\n var element = document.createElement('script');\n element.onerror = on_error;\n element.async = false;\n element.type = \"module\";\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n element.textContent = `\n import ${name} from \"${url}\"\n window.${name} = ${name}\n window._bokeh_on_load()\n `\n document.head.appendChild(element);\n }\n if (!js_urls.length && !js_modules.length) {\n on_load()\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n var js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.4.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.4.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.4.1.min.js\", 
\"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.4.1.min.js\", \"https://cdn.holoviz.org/panel/1.4.4/dist/panel.min.js\"];\n var js_modules = [];\n var js_exports = {};\n var css_urls = [];\n var inline_js = [ function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\nfunction(Bokeh) {} // ensure no trailing comma for IE\n ];\n\n function run_inline_js() {\n if ((root.Bokeh !== undefined) || (force === true)) {\n for (var i = 0; i < inline_js.length; i++) {\n\ttry {\n inline_js[i].call(root, root.Bokeh);\n\t} catch(e) {\n\t if (!reloading) {\n\t throw e;\n\t }\n\t}\n }\n // Cache old bokeh versions\n if (Bokeh != undefined && !reloading) {\n\tvar NewBokeh = root.Bokeh;\n\tif (Bokeh.versions === undefined) {\n\t Bokeh.versions = new Map();\n\t}\n\tif (NewBokeh.version !== Bokeh.version) {\n\t Bokeh.versions.set(NewBokeh.version, NewBokeh)\n\t}\n\troot.Bokeh = Bokeh;\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n }\n root._bokeh_is_initializing = false\n }\n\n function load_or_wait() {\n // Implement a backoff loop that tries to ensure we do not load multiple\n // versions of Bokeh and its dependencies at the same time.\n // In recent versions we use the root._bokeh_is_initializing flag\n // to determine whether there is an ongoing attempt to initialize\n // bokeh, however for backward compatibility we also try to ensure\n // that we do not start loading a newer (Panel>=1.0 and Bokeh>3) version\n // before older versions are fully initialized.\n if (root._bokeh_is_initializing && Date.now() > root._bokeh_timeout) {\n root._bokeh_is_initializing = false;\n root._bokeh_onload_callbacks = undefined;\n console.log(\"Bokeh: BokehJS was loaded multiple times but one version failed to initialize.\");\n load_or_wait();\n } else if (root._bokeh_is_initializing || (typeof 
root._bokeh_is_initializing === \"undefined\" && root._bokeh_onload_callbacks !== undefined)) {\n setTimeout(load_or_wait, 100);\n } else {\n root._bokeh_is_initializing = true\n root._bokeh_onload_callbacks = []\n var bokeh_loaded = Bokeh != null && (Bokeh.version === py_version || (Bokeh.versions !== undefined && Bokeh.versions.has(py_version)));\n if (!reloading && !bokeh_loaded) {\n\troot.Bokeh = undefined;\n }\n load_libs(css_urls, js_urls, js_modules, js_exports, function() {\n\tconsole.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n\trun_inline_js();\n });\n }\n }\n // Give older versions of the autoload script a head-start to ensure\n // they initialize before we start loading newer version.\n setTimeout(load_or_wait, 100)\n}(window));","application/vnd.holoviews_load.v0+json":""},"metadata":{},"output_type":"display_data"},{"data":{"application/javascript":"\nif ((window.PyViz === undefined) || (window.PyViz instanceof HTMLElement)) {\n window.PyViz = {comms: {}, comm_status:{}, kernels:{}, receivers: {}, plot_index: []}\n}\n\n\n function JupyterCommManager() {\n }\n\n JupyterCommManager.prototype.register_target = function(plot_id, comm_id, msg_handler) {\n if (window.comm_manager || ((window.Jupyter !== undefined) && (Jupyter.notebook.kernel != null))) {\n var comm_manager = window.comm_manager || Jupyter.notebook.kernel.comm_manager;\n comm_manager.register_target(comm_id, function(comm) {\n comm.on_msg(msg_handler);\n });\n } else if ((plot_id in window.PyViz.kernels) && (window.PyViz.kernels[plot_id])) {\n window.PyViz.kernels[plot_id].registerCommTarget(comm_id, function(comm) {\n comm.onMsg = msg_handler;\n });\n } else if (typeof google != 'undefined' && google.colab.kernel != null) {\n google.colab.kernel.comms.registerTarget(comm_id, (comm) => {\n var messages = comm.messages[Symbol.asyncIterator]();\n function processIteratorResult(result) {\n var message = result.value;\n console.log(message)\n var content = {data: message.data, 
comm_id};\n var buffers = []\n for (var buffer of message.buffers || []) {\n buffers.push(new DataView(buffer))\n }\n var metadata = message.metadata || {};\n var msg = {content, buffers, metadata}\n msg_handler(msg);\n return messages.next().then(processIteratorResult);\n }\n return messages.next().then(processIteratorResult);\n })\n }\n }\n\n JupyterCommManager.prototype.get_client_comm = function(plot_id, comm_id, msg_handler) {\n if (comm_id in window.PyViz.comms) {\n return window.PyViz.comms[comm_id];\n } else if (window.comm_manager || ((window.Jupyter !== undefined) && (Jupyter.notebook.kernel != null))) {\n var comm_manager = window.comm_manager || Jupyter.notebook.kernel.comm_manager;\n var comm = comm_manager.new_comm(comm_id, {}, {}, {}, comm_id);\n if (msg_handler) {\n comm.on_msg(msg_handler);\n }\n } else if ((plot_id in window.PyViz.kernels) && (window.PyViz.kernels[plot_id])) {\n var comm = window.PyViz.kernels[plot_id].connectToComm(comm_id);\n comm.open();\n if (msg_handler) {\n comm.onMsg = msg_handler;\n }\n } else if (typeof google != 'undefined' && google.colab.kernel != null) {\n var comm_promise = google.colab.kernel.comms.open(comm_id)\n comm_promise.then((comm) => {\n window.PyViz.comms[comm_id] = comm;\n if (msg_handler) {\n var messages = comm.messages[Symbol.asyncIterator]();\n function processIteratorResult(result) {\n var message = result.value;\n var content = {data: message.data};\n var metadata = message.metadata || {comm_id};\n var msg = {content, metadata}\n msg_handler(msg);\n return messages.next().then(processIteratorResult);\n }\n return messages.next().then(processIteratorResult);\n }\n }) \n var sendClosure = (data, metadata, buffers, disposeOnDone) => {\n return comm_promise.then((comm) => {\n comm.send(data, metadata, buffers, disposeOnDone);\n });\n };\n var comm = {\n send: sendClosure\n };\n }\n window.PyViz.comms[comm_id] = comm;\n return comm;\n }\n window.PyViz.comm_manager = new JupyterCommManager();\n \n\n\nvar 
JS_MIME_TYPE = 'application/javascript';\nvar HTML_MIME_TYPE = 'text/html';\nvar EXEC_MIME_TYPE = 'application/vnd.holoviews_exec.v0+json';\nvar CLASS_NAME = 'output';\n\n/**\n * Render data to the DOM node\n */\nfunction render(props, node) {\n var div = document.createElement(\"div\");\n var script = document.createElement(\"script\");\n node.appendChild(div);\n node.appendChild(script);\n}\n\n/**\n * Handle when a new output is added\n */\nfunction handle_add_output(event, handle) {\n var output_area = handle.output_area;\n var output = handle.output;\n if ((output.data == undefined) || (!output.data.hasOwnProperty(EXEC_MIME_TYPE))) {\n return\n }\n var id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n var toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n if (id !== undefined) {\n var nchildren = toinsert.length;\n var html_node = toinsert[nchildren-1].children[0];\n html_node.innerHTML = output.data[HTML_MIME_TYPE];\n var scripts = [];\n var nodelist = html_node.querySelectorAll(\"script\");\n for (var i in nodelist) {\n if (nodelist.hasOwnProperty(i)) {\n scripts.push(nodelist[i])\n }\n }\n\n scripts.forEach( function (oldScript) {\n var newScript = document.createElement(\"script\");\n var attrs = [];\n var nodemap = oldScript.attributes;\n for (var j in nodemap) {\n if (nodemap.hasOwnProperty(j)) {\n attrs.push(nodemap[j])\n }\n }\n attrs.forEach(function(attr) { newScript.setAttribute(attr.name, attr.value) });\n newScript.appendChild(document.createTextNode(oldScript.innerHTML));\n oldScript.parentNode.replaceChild(newScript, oldScript);\n });\n if (JS_MIME_TYPE in output.data) {\n toinsert[nchildren-1].children[1].textContent = output.data[JS_MIME_TYPE];\n }\n output_area._hv_plot_id = id;\n if ((window.Bokeh !== undefined) && (id in Bokeh.index)) {\n window.PyViz.plot_index[id] = Bokeh.index[id];\n } else {\n window.PyViz.plot_index[id] = null;\n }\n } else if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n var 
bk_div = document.createElement(\"div\");\n bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n var script_attrs = bk_div.children[0].attributes;\n for (var i = 0; i < script_attrs.length; i++) {\n toinsert[toinsert.length - 1].childNodes[1].setAttribute(script_attrs[i].name, script_attrs[i].value);\n }\n // store reference to server id on output_area\n output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n }\n}\n\n/**\n * Handle when an output is cleared or removed\n */\nfunction handle_clear_output(event, handle) {\n var id = handle.cell.output_area._hv_plot_id;\n var server_id = handle.cell.output_area._bokeh_server_id;\n if (((id === undefined) || !(id in PyViz.plot_index)) && (server_id !== undefined)) { return; }\n var comm = window.PyViz.comm_manager.get_client_comm(\"hv-extension-comm\", \"hv-extension-comm\", function () {});\n if (server_id !== null) {\n comm.send({event_type: 'server_delete', 'id': server_id});\n return;\n } else if (comm !== null) {\n comm.send({event_type: 'delete', 'id': id});\n }\n delete PyViz.plot_index[id];\n if ((window.Bokeh !== undefined) & (id in window.Bokeh.index)) {\n var doc = window.Bokeh.index[id].model.document\n doc.clear();\n const i = window.Bokeh.documents.indexOf(doc);\n if (i > -1) {\n window.Bokeh.documents.splice(i, 1);\n }\n }\n}\n\n/**\n * Handle kernel restart event\n */\nfunction handle_kernel_cleanup(event, handle) {\n delete PyViz.comms[\"hv-extension-comm\"];\n window.PyViz.plot_index = {}\n}\n\n/**\n * Handle update_display_data messages\n */\nfunction handle_update_output(event, handle) {\n handle_clear_output(event, {cell: {output_area: handle.output_area}})\n handle_add_output(event, handle)\n}\n\nfunction register_renderer(events, OutputArea) {\n function append_mime(data, metadata, element) {\n // create a DOM node to render to\n var toinsert = this.create_output_subarea(\n metadata,\n CLASS_NAME,\n EXEC_MIME_TYPE\n );\n this.keyboard_manager.register_events(toinsert);\n // 
Render to node\n var props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n render(props, toinsert[0]);\n element.append(toinsert);\n return toinsert\n }\n\n events.on('output_added.OutputArea', handle_add_output);\n events.on('output_updated.OutputArea', handle_update_output);\n events.on('clear_output.CodeCell', handle_clear_output);\n events.on('delete.Cell', handle_clear_output);\n events.on('kernel_ready.Kernel', handle_kernel_cleanup);\n\n OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n safe: true,\n index: 0\n });\n}\n\nif (window.Jupyter !== undefined) {\n try {\n var events = require('base/js/events');\n var OutputArea = require('notebook/js/outputarea').OutputArea;\n if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n register_renderer(events, OutputArea);\n }\n } catch(err) {\n }\n}\n","application/vnd.holoviews_load.v0+json":""},"metadata":{},"output_type":"display_data"},{"data":{"text/html":[""]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.holoviews_exec.v0+json":"","text/html":["\n",""]},"metadata":{"application/vnd.holoviews_exec.v0+json":{"id":"fe2d2700-51b2-47ec-874d-c95a52a94823"}},"output_type":"display_data"}],"source":["import os\n","import openai\n","import sys\n","sys.path.append('../..')\n","\n","import panel as pn # GUI\n","pn.extension()\n","\n","from dotenv import load_dotenv, find_dotenv\n","_ = load_dotenv(find_dotenv()) # read local .env file\n","\n","openai.api_key = os.environ['OPENAI_API_KEY']"]},{"cell_type":"markdown","id":"f71bd908","metadata":{"height":30},"source":["以下代码是为了 openai LLM 版本备案,直至其被弃用(于 2023 年 9 月)。LLM 响应通常会有所不同,但在使用不同模型版本时,这种差异可能会更明显。"]},{"cell_type":"code","execution_count":2,"id":"83558969-e5aa-4fbc-933c-9c1534decf0a","metadata":{"height":147,"tags":[]},"outputs":[{"name":"stdout","output_type":"stream","text":["gpt-3.5-turbo\n"]}],"source":["import datetime\n","current_date = datetime.datetime.now().date()\n","if current_date < 
datetime.date(2023, 9, 2):\n"," llm_name = \"gpt-3.5-turbo-0301\"\n","else:\n"," llm_name = \"gpt-3.5-turbo\"\n","print(llm_name)"]},{"cell_type":"markdown","id":"baddf039","metadata":{},"source":["如果您想在 `Lang Chain plus` 平台上进行实验:\n","\n","- 前往 langchain plus 平台并注册\n","\n","- 从您的帐户设置创建 api 密钥\n","\n","- 在下面的代码中使用此 api 密钥"]},{"cell_type":"code","execution_count":3,"id":"ba19ea7d","metadata":{"height":96,"tags":[]},"outputs":[],"source":["#import os\n","#os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n","#os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.langchain.plus\"\n","#os.environ[\"LANGCHAIN_API_KEY\"] = \"...\""]},{"cell_type":"code","execution_count":4,"id":"ab80dc86","metadata":{"height":113,"tags":[]},"outputs":[{"name":"stderr","output_type":"stream","text":["/Users/lta/anaconda3/envs/chat_data/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:119: LangChainDeprecationWarning: The class `OpenAIEmbeddings` was deprecated in LangChain 0.0.9 and will be removed in 0.3.0. An updated version of the class exists in the langchain-openai package and should be used instead. 
To use it run `pip install -U langchain-openai` and import as `from langchain_openai import OpenAIEmbeddings`.\n"," warn_deprecated(\n"]}],"source":["# 加载向量库,其中包含了所有课程材料的 Embedding。\n","from langchain.vectorstores import Chroma\n","from langchain.embeddings.openai import OpenAIEmbeddings\n","persist_directory = 'docs/chroma/'\n","embedding = OpenAIEmbeddings(model='text-embedding-3-small')\n","vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)"]},{"cell_type":"code","execution_count":5,"id":"b7a409bf","metadata":{"height":79,"tags":[]},"outputs":[{"data":{"text/plain":["0"]},"execution_count":5,"metadata":{},"output_type":"execute_result"}],"source":["# 对向量库进行基本的相似度搜索\n","question = \"What are major topics for this class?\"\n","docs = vectordb.similarity_search(question,k=3)\n","len(docs)"]},{"cell_type":"code","execution_count":6,"id":"d59d01b0","metadata":{"height":79,"tags":[]},"outputs":[{"data":{"text/plain":["0"]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["question = \"这门课的主要内容是什么?\"\n","docs = vectordb.similarity_search(question,k=3)\n","len(docs)"]},{"cell_type":"code","execution_count":7,"id":"01c58c2b","metadata":{"height":79,"tags":[]},"outputs":[{"name":"stderr","output_type":"stream","text":["/Users/lta/anaconda3/envs/chat_data/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:119: LangChainDeprecationWarning: The class `ChatOpenAI` was deprecated in LangChain 0.0.10 and will be removed in 0.3.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import ChatOpenAI`.\n"," warn_deprecated(\n","/Users/lta/anaconda3/envs/chat_data/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:119: LangChainDeprecationWarning: The method `BaseChatModel.predict` was deprecated in langchain-core 0.1.7 and will be removed in 0.3.0. 
Use invoke instead.\n"," warn_deprecated(\n"]},{"data":{"text/plain":["'Hello! How can I assist you today?'"]},"execution_count":7,"metadata":{},"output_type":"execute_result"}],"source":["from langchain.chat_models.openai import ChatOpenAI\n","llm = ChatOpenAI(model_name=llm_name, temperature=0)\n","llm.predict(\"Hello world!\")"]},{"cell_type":"code","execution_count":8,"id":"d683c6ed","metadata":{"height":336},"outputs":[{"name":"stderr","output_type":"stream","text":["/Users/lta/anaconda3/envs/chat_data/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:119: LangChainDeprecationWarning: The method `Chain.__call__` was deprecated in langchain 0.1.0 and will be removed in 0.3.0. Use invoke instead.\n"," warn_deprecated(\n"]},{"data":{"text/plain":["'Yes, probability is a class topic in mathematics that deals with the likelihood of events occurring. It is often taught in middle school or high school as part of the curriculum. Thanks for asking!'"]},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":["# 初始化一个 prompt 模板,创建一个检索 QA 链,然后传入一个问题并得到一个结果。\n","# 构建 prompt\n","from langchain.prompts import PromptTemplate\n","template = \"\"\"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say \"thanks for asking!\" at the end of the answer. 
\n","{context}\n","Question: {question}\n","Helpful Answer:\"\"\"\n","QA_CHAIN_PROMPT = PromptTemplate(input_variables=[\"context\", \"question\"],template=template,)\n","\n","# 运行 chain\n","from langchain.chains.retrieval_qa.base import RetrievalQA\n","question = \"Is probability a class topic?\"\n","qa_chain = RetrievalQA.from_chain_type(llm,\n"," retriever=vectordb.as_retriever(),\n"," return_source_documents=True,\n"," chain_type_kwargs={\"prompt\": QA_CHAIN_PROMPT})\n","\n","\n","result = qa_chain({\"query\": question})\n","result[\"result\"]"]},{"cell_type":"code","execution_count":9,"id":"b8aa6c11","metadata":{"height":351,"tags":[]},"outputs":[{"data":{"text/plain":["'是的,概率是我们这周的课堂主题。\\n谢谢你的提问!'"]},"execution_count":9,"metadata":{},"output_type":"execute_result"}],"source":["# 构建 prompt\n","from langchain.prompts import PromptTemplate\n","template = \"\"\"使用以下上下文来回答最后的问题。如果你不知道答案,就说你不知道,不要试图编造答案。最多使用三句话。尽量使答案简明扼要。总是在回答的最后说“谢谢你的提问!”。\n","{context}\n","问题: {question}\n","有用的回答:\"\"\"\n","QA_CHAIN_PROMPT = PromptTemplate(input_variables=[\"context\", \"question\"],template=template,)\n","\n","# 运行 chain\n","from langchain.chains import RetrievalQA\n","question = \"概率是课堂主题吗?\"\n","qa_chain = RetrievalQA.from_chain_type(llm,\n"," retriever=vectordb.as_retriever(),\n"," return_source_documents=True,\n"," chain_type_kwargs={\"prompt\": QA_CHAIN_PROMPT})\n","\n","\n","result = qa_chain({\"query\": question})\n","result[\"result\"]"]},{"cell_type":"markdown","id":"84f8d2e2","metadata":{},"source":["## 二、记忆(Memory)"]},{"cell_type":"markdown","id":"5263e1ed","metadata":{"height":30},"source":["现在让我们更进一步,添加一些记忆功能。\n","\n","我们将使用 `ConversationBufferMemory`。它保存聊天消息历史记录的列表,这些历史记录将在回答问题时与问题一起传递给聊天机器人,从而将它们添加到上下文中。\n","\n","需要注意的是,我们之前讨论的上下文检索等方法,在这里同样可用。 "]},{"cell_type":"code","execution_count":10,"id":"7e99e754","metadata":{"height":113,"tags":[]},"outputs":[],"source":["from langchain.memory import ConversationBufferMemory\n","memory = ConversationBufferMemory(\n"," 
memory_key=\"chat_history\", # 与 prompt 的输入变量保持一致。\n"," return_messages=True # 将以消息列表的形式返回聊天记录,而不是单个字符串\n",")"]},{"cell_type":"markdown","id":"244e1b7a","metadata":{},"source":["## 三、对话检索链(ConversationalRetrievalChain)"]},{"cell_type":"markdown","id":"39700b1c","metadata":{"height":30},"source":["在检索 QA 链的基础上,对话检索链添加了新步骤,将历史记录和新问题压缩为一个独立的问题,并在向量库中进行相关文档的查找。此外,当有多个答案时,内存记录下所有答案。我们在运行和观察其效果后,可以在UI中查看具体细节。 \n","\n","对话检索链(ConversationalRetrievalChain)在 QA 检索链的基础上增加了一个新的环节,而不仅仅是 memory。\n","\n","具体来说,它增加了一个步骤,该步骤将历史记录和新问题浓缩为一个独立的问题,并将其传递给向量存储器以查找相关文档。\n","\n","\n","我们将在运行后在用户界面中查看其效果。但现在,让我们来试试。我们可以提出一个问题。这是没有任何历史记录的,看看我们得到的结果。然后我们可以针对答案提出一个后续问题。这和之前一样。所以我们问,概率是一个类主题吗?我们会得到一些答案。"]},{"cell_type":"code","execution_count":11,"id":"23700502","metadata":{"height":147,"tags":[]},"outputs":[],"source":["from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain\n","retriever=vectordb.as_retriever()\n","qa = ConversationalRetrievalChain.from_llm(\n"," llm,\n"," retriever=retriever,\n"," memory=memory\n",")"]},{"cell_type":"code","execution_count":12,"id":"e25b9fbb","metadata":{"height":62,"tags":[]},"outputs":[{"name":"stdout","output_type":"stream","text":["Yes, probability is a common topic covered in mathematics classes, particularly in courses like statistics, probability theory, and sometimes even in introductory math classes. 
It is an important concept in understanding uncertainty and making predictions based on data.\n"]}],"source":["question = \"Is probability a class topic?\"\n","result = qa({\"question\": question})\n","print(result['answer'])"]},{"cell_type":"code","execution_count":13,"id":"622478ba","metadata":{"height":69,"scrolled":true,"tags":[]},"outputs":[{"name":"stdout","output_type":"stream","text":["概率通常在高中数学课程中被涵盖,也会在大学的统计学、概率论、数理统计等相关课程中深入讨论。\n"]}],"source":["question = \"概率是一个课堂主题吗?\"\n","result = qa({\"question\": question})\n","print(result['answer'])"]},{"cell_type":"code","execution_count":14,"id":"5b280672","metadata":{"height":62,"tags":[]},"outputs":[{"name":"stdout","output_type":"stream","text":["Prerequisites are needed for studying probability because probability involves complex mathematical concepts and calculations. To understand probability theory effectively, students need a solid foundation in areas such as algebra, calculus, and statistics. Without these prerequisites, it can be challenging to grasp the advanced concepts and applications of probability.\n"]}],"source":["question = \"why are those prerequesites needed?\"\n","result = qa({\"question\": question})\n","print(result['answer'])"]},{"cell_type":"code","execution_count":15,"id":"fcd08a60","metadata":{"height":70,"tags":[]},"outputs":[{"name":"stdout","output_type":"stream","text":["这些先决条件可能包括对基本数学概念的理解,如概率、统计和逻辑等。这些知识可以帮助学习者更好地理解概率的概念和应用。\n"]}],"source":["question = \"为什么需要这些先决条件?\"\n","result = qa({\"question\": question})\n","print(result['answer'])"]},{"cell_type":"markdown","id":"31e0742e-2085-4635-b87d-6596925f1077","metadata":{"tags":[]},"source":["## 四、创建一个适用于您文档的聊天机器人"]},{"cell_type":"code","execution_count":16,"id":"8622a19c-0711-49aa-8d1d-9ddfb6af7032","metadata":{"height":181,"tags":[]},"outputs":[],"source":["from langchain.embeddings.openai import OpenAIEmbeddings\n","from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter\n","from 
langchain.vectorstores import DocArrayInMemorySearch\n","from langchain.document_loaders import TextLoader\n","from langchain.chains.retrieval_qa.base import RetrievalQA\n","from langchain.memory import ConversationBufferMemory\n","from langchain.chat_models.openai import ChatOpenAI\n","from langchain.document_loaders import TextLoader\n","from langchain.document_loaders import PyPDFLoader"]},{"cell_type":"markdown","id":"95f69afe","metadata":{},"source":["跟视频拍摄相比,聊天机器人代码已进行了一些更新。图形用户界面的外观也因运行平台而异。"]},{"cell_type":"code","execution_count":17,"id":"69dff120","metadata":{"height":404},"outputs":[],"source":["def load_db(file, chain_type, k):\n"," \"\"\"\n"," 该函数用于加载 PDF 文件,切分文档,生成文档的嵌入向量,创建向量数据库,定义检索器,并创建聊天机器人实例。\n","\n"," 参数:\n"," file (str): 要加载的 PDF 文件路径。\n"," chain_type (str): 链类型,用于指定聊天机器人的类型。\n"," k (int): 在检索过程中,返回最相似的 k 个结果。\n","\n"," 返回:\n"," qa (ConversationalRetrievalChain): 创建的聊天机器人实例。\n"," \"\"\"\n"," # 载入文档\n"," loader = PyPDFLoader(file)\n"," documents = loader.load()\n"," # 切分文档\n"," text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)\n"," docs = text_splitter.split_documents(documents)\n"," # 定义 Embeddings\n"," embeddings = OpenAIEmbeddings(model='text-embedding-3-small')\n"," # 根据数据创建向量数据库\n"," db = DocArrayInMemorySearch.from_documents(docs, embeddings)\n"," # 定义检索器\n"," retriever = db.as_retriever(search_type=\"similarity\", search_kwargs={\"k\": k})\n"," # 创建 chatbot 链,Memory 由外部管理\n"," qa = ConversationalRetrievalChain.from_llm(\n"," llm=ChatOpenAI(model_name=llm_name, temperature=0), \n"," chain_type=chain_type, \n"," retriever=retriever, \n"," return_source_documents=True,\n"," return_generated_question=True,\n"," )\n"," return qa \n"]},{"cell_type":"code","execution_count":22,"id":"b525019d-3acd-40ff-9844-141116155d68","metadata":{"height":1320,"tags":[]},"outputs":[],"source":["import panel as pn\n","import param\n","\n","# 用于存储聊天记录、回答、数据库查询和回复\n","class cbfs(param.Parameterized):\n"," chat_history = 
param.List([])\n"," answer = param.String(\"\")\n"," db_query = param.String(\"\")\n"," db_response = param.List([])\n"," \n"," def __init__(self, **params):\n"," super(cbfs, self).__init__( **params)\n"," self.panels = []\n"," self.loaded_file = \"docs/cs229_lectures/MachineLearning-Lecture01.pdf\"\n"," self.qa = load_db(self.loaded_file,\"stuff\", 4)\n"," \n"," # 将文档加载到聊天机器人中\n"," def call_load_db(self, count):\n"," \"\"\"\n"," count: 数量\n"," \"\"\"\n"," if count == 0 or file_input.value is None: # 初始化或未指定文件 :\n"," return pn.pane.Markdown(f\"Loaded File: {self.loaded_file}\")\n"," else:\n"," file_input.save(\"temp.pdf\") # 本地副本\n"," self.loaded_file = file_input.filename\n"," button_load.button_style=\"outline\"\n"," self.qa = load_db(\"temp.pdf\", \"stuff\", 4)\n"," button_load.button_style=\"solid\"\n"," self.clr_history()\n"," return pn.pane.Markdown(f\"Loaded File: {self.loaded_file}\")\n","\n"," # 处理对话链\n"," def convchain(self, query):\n"," \"\"\"\n"," query: 用户的查询\n"," \"\"\"\n"," if not query:\n"," return pn.WidgetBox(pn.Row('User:', pn.pane.Markdown(\"\", width=600)), scroll=True)\n"," result = self.qa({\"question\": query, \"chat_history\": self.chat_history})\n"," self.chat_history.extend([(query, result[\"answer\"])])\n"," self.db_query = result[\"generated_question\"]\n"," self.db_response = result[\"source_documents\"]\n"," self.answer = result['answer'] \n"," self.panels.extend([\n"," pn.Row('User:', pn.pane.Markdown(query, width=600)),\n"," pn.Row('ChatBot:', pn.pane.Markdown(self.answer, width=600, styles={'background-color': '#F6F6F6'}))\n"," ])\n"," inp.value = '' # 清除时清除装载指示器\n"," return pn.WidgetBox(*self.panels,scroll=True)\n"," \n"," # 获取最后发送到数据库的问题\n"," @param.depends('db_query ', )\n"," def get_lquest(self):\n"," if not self.db_query :\n"," return pn.Column(\n"," pn.Row(pn.pane.Markdown(f\"Last question to DB:\", styles={'background-color': '#F6F6F6'})),\n"," pn.Row(pn.pane.Str(\"no DB accesses so far\"))\n"," )\n"," return pn.Column(\n"," 
pn.Row(pn.pane.Markdown(f\"DB query:\", styles={'background-color': '#F6F6F6'})),\n"," pn.pane.Str(self.db_query )\n"," )\n"," \n"," # 获取数据库返回的源文件\n"," @param.depends('db_response', )\n"," def get_sources(self):\n"," if not self.db_response:\n"," return \n"," rlist=[pn.Row(pn.pane.Markdown(f\"Result of DB lookup:\", styles={'background-color': '#F6F6F6'}))]\n"," for doc in self.db_response:\n"," rlist.append(pn.Row(pn.pane.Str(doc)))\n"," return pn.WidgetBox(*rlist, width=600, scroll=True)\n","\n"," # 获取当前聊天记录\n"," @param.depends('convchain', 'clr_history') \n"," def get_chats(self):\n"," if not self.chat_history:\n"," return pn.WidgetBox(pn.Row(pn.pane.Str(\"No History Yet\")), width=600, scroll=True)\n"," rlist=[pn.Row(pn.pane.Markdown(f\"Current Chat History variable\", styles={'background-color': '#F6F6F6'}))]\n"," for exchange in self.chat_history:\n"," rlist.append(pn.Row(pn.pane.Str(exchange)))\n"," return pn.WidgetBox(*rlist, width=600, scroll=True)\n"," \n"," # 清除聊天记录\n"," def clr_history(self,count=0):\n"," self.chat_history = []\n"," return \n"]},{"cell_type":"markdown","id":"54958d9d-2a0f-457d-9a2b-97e6c9859684","metadata":{"tags":[]},"source":["## 五、创建聊天机器人"]},{"cell_type":"code","execution_count":23,"id":"73b7f691-6314-4fcc-9e48-0d09d61099fd","metadata":{"height":691,"tags":[]},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"7a87f31af0b9437f8785275a532855c2","version_major":2,"version_minor":0},"text/plain":["BokehModel(combine_events=True, render_bundle={'docs_json': {'377d369e-b052-4e1b-a745-ca48584505a1': {'version…"]},"execution_count":23,"metadata":{},"output_type":"execute_result"}],"source":["# 初始化聊天机器人\n","cb = cbfs() \n","\n","# 定义界面的小部件\n","file_input = pn.widgets.FileInput(accept='.pdf') # PDF 文件的文件输入小部件\n","button_load = pn.widgets.Button(name=\"Load DB\", button_type='primary') # 加载数据库的按钮\n","button_clearhistory = pn.widgets.Button(name=\"Clear History\", button_type='warning') # 
清除聊天记录的按钮\n","button_clearhistory.on_click(cb.clr_history) # 将清除历史记录功能绑定到按钮上\n","inp = pn.widgets.TextInput( placeholder='Enter text here…') # 用于用户查询的文本输入小部件\n","\n","# 将加载数据库和对话的函数绑定到相应的部件上\n","bound_button_load = pn.bind(cb.call_load_db, button_load.param.clicks)\n","conversation = pn.bind(cb.convchain, inp) \n","\n","jpg_pane = pn.pane.Image( './img/convchain.jpg')\n","\n","# 使用 Panel 定义界面布局\n","tab1 = pn.Column(\n"," pn.Row(inp),\n"," pn.layout.Divider(),\n"," pn.panel(conversation, loading_indicator=True, height=300),\n"," pn.layout.Divider(),\n",")\n","tab2= pn.Column(\n"," pn.panel(cb.get_lquest),\n"," pn.layout.Divider(),\n"," pn.panel(cb.get_sources ),\n",")\n","tab3= pn.Column(\n"," pn.panel(cb.get_chats),\n"," pn.layout.Divider(),\n",")\n","tab4=pn.Column(\n"," pn.Row( file_input, button_load, bound_button_load),\n"," pn.Row( button_clearhistory, pn.pane.Markdown(\"Clears chat history. Can use to start a new topic\" )),\n"," pn.layout.Divider(),\n"," pn.Row(jpg_pane.clone(width=400))\n",")\n","# 将所有选项卡合并为一个仪表盘\n","dashboard = pn.Column(\n"," pn.Row(pn.pane.Markdown('# ChatWithYourData_Bot')),\n"," pn.Tabs(('Conversation', tab1), ('Database', tab2), ('Chat History', tab3),('Configure', tab4))\n",")\n","dashboard"]},{"cell_type":"markdown","id":"d2d2a016","metadata":{"height":30},"source":["请随意复制此代码并对其进行修改以添加您自己的功能。您可以通过更改 `load_db` 函数和 `convchain` 方法中的配置来尝试备用内存和检索器模型。[panel](https://panel.holoviz.org/) 和 [Param](https://param.holoviz.org/) 有许多有用的特性和小部件,可以用来扩展GUI。"]},{"cell_type":"markdown","id":"d0275b31-b5e3-4e15-a77f-1bf30c3e04e4","metadata":{},"source":["## 六、致谢\n","\n","基于 panel 的聊天机器人的灵感来自 Sophia Yang, [github](https://github.com/sophiamyang/tutorials-LangChain)"]}],"metadata":{"kernelspec":{"display_name":"Python 3 
(ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.9"}},"nbformat":4,"nbformat_minor":5}
+{"cells":[{"cell_type":"markdown","id":"956ff3a4","metadata":{},"source":["# 第七章、聊天 Chat\n","\n"," - [一、环境配置](#一、环境配置)\n"," - [二、记忆(Memory)](#二、记忆(Memory))\n"," - [三、对话检索链(ConversationalRetrievalChain)](#三、对话检索链(ConversationalRetrievalChain))\n"," - [四、创建一个适用于您文档的聊天机器人](#四、创建一个适用于您文档的聊天机器人)\n"," - [五、创建聊天机器人](#五、创建聊天机器人)\n"," - [六、致谢](#六、致谢)\n"]},{"cell_type":"markdown","id":"6f2e2cd9","metadata":{},"source":["回想一下检索增强生成 (retrieval augmented generation,RAG) 的整体工作流程:"]},{"attachments":{"overview.jpeg":{"image/jpeg":""}},"cell_type":"markdown","id":"231abd66","metadata":{},"source":[""]},{"cell_type":"markdown","id":"5e6a4d0c","metadata":{},"source":["我们已经接近完成一个功能性的聊天机器人了。我们讨论了`文档加载`、`切分`、`存储`和`检索`。我们展示了如何使用`检索 QA`链在 Q+A 中使用`检索`生成输出。\n","\n","我们的机器人已经可以回答问题了,但还无法处理后续问题,无法进行真正的对话。好消息是,在本章中,我们将解决这个问题。\n","\n","我们现在将创建一个问答聊天机器人。它与之前非常相似,但我们将添加聊天历史的功能。这是您之前进行的任何对话或消息。这将使机器人在尝试回答问题时能够考虑到聊天历史的上下文。所以,如果您继续提问,它会知道您想谈论什么。"]},{"cell_type":"markdown","id":"1f698037","metadata":{},"source":["## 一、环境配置"]},{"cell_type":"code","execution_count":null,"id":"926fc482-dd03-40e1-8083-171aad3e2b26","metadata":{"height":232,"tags":[]},"outputs":[],"source":["import os\n","import openai\n","import sys\n","sys.path.append('../..')\n","\n","import panel as pn # GUI\n","pn.extension()\n","\n","from dotenv import load_dotenv, find_dotenv\n","_ = load_dotenv(find_dotenv()) # read local .env file\n","\n","openai.api_key = os.environ['OPENAI_API_KEY']"]},{"cell_type":"markdown","id":"f71bd908","metadata":{"height":30},"source":["以下代码是为了 openai LLM 版本备案,直至其被弃用(于 2023 年 9 月)。LLM 响应通常会有所不同,但在使用不同模型版本时,这种差异可能会更明显。"]},{"cell_type":"code","execution_count":2,"id":"83558969-e5aa-4fbc-933c-9c1534decf0a","metadata":{"height":147,"tags":[]},"outputs":[{"name":"stdout","output_type":"stream","text":["gpt-3.5-turbo-0301\n"]}],"source":["import datetime\n","current_date = datetime.datetime.now().date()\n","if current_date < datetime.date(2023, 9, 2):\n"," llm_name = \"gpt-3.5-turbo-0301\"\n","else:\n"," 
llm_name = \"gpt-3.5-turbo\"\n","print(llm_name)"]},{"cell_type":"markdown","id":"baddf039","metadata":{},"source":["如果您想在 `Lang Chain plus` 平台上进行实验:\n","\n","- 前往 langchain plus 平台并注册\n","\n","- 从您的帐户设置创建 api 密钥\n","\n","- 在下面的代码中使用此 api 密钥"]},{"cell_type":"code","execution_count":null,"id":"ba19ea7d","metadata":{"height":96,"tags":[]},"outputs":[],"source":["#import os\n","#os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n","#os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.langchain.plus\"\n","#os.environ[\"LANGCHAIN_API_KEY\"] = \"...\""]},{"cell_type":"code","execution_count":3,"id":"ab80dc86","metadata":{"height":113,"tags":[]},"outputs":[],"source":["# 加载向量库,其中包含了所有课程材料的 Embedding。\n","from langchain.vectorstores import Chroma\n","from langchain.embeddings.openai import OpenAIEmbeddings\n","persist_directory = 'docs/chroma/'\n","embedding = OpenAIEmbeddings()\n","vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)"]},{"cell_type":"code","execution_count":5,"id":"b7a409bf","metadata":{"height":79,"tags":[]},"outputs":[{"data":{"text/plain":["3"]},"execution_count":5,"metadata":{},"output_type":"execute_result"}],"source":["# 对向量库进行基本的相似度搜索\n","question = \"What are major topics for this class?\"\n","docs = vectordb.similarity_search(question,k=3)\n","len(docs)"]},{"cell_type":"code","execution_count":4,"id":"d59d01b0","metadata":{"height":79,"tags":[]},"outputs":[{"data":{"text/plain":["3"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["question = \"这门课的主要内容是什么?\"\n","docs = vectordb.similarity_search(question,k=3)\n","len(docs)"]},{"cell_type":"code","execution_count":5,"id":"01c58c2b","metadata":{"height":79,"tags":[]},"outputs":[{"data":{"text/plain":["'Hello there! 
How can I assist you today?'"]},"execution_count":5,"metadata":{},"output_type":"execute_result"}],"source":["from langchain.chat_models import ChatOpenAI\n","llm = ChatOpenAI(model_name=llm_name, temperature=0)\n","llm.predict(\"Hello world!\")"]},{"cell_type":"code","execution_count":7,"id":"d683c6ed","metadata":{"height":336},"outputs":[{"data":{"text/plain":["'Yes, probability is assumed to be a prerequisite for this class. The instructor assumes familiarity with basic probability and statistics, and will go over some of the prerequisites in the discussion sections as a refresher course. Thanks for asking!'"]},"execution_count":7,"metadata":{},"output_type":"execute_result"}],"source":["# 初始化一个 prompt 模板,创建一个检索 QA 链,然后传入一个问题并得到一个结果。\n","# 构建 prompt\n","from langchain.prompts import PromptTemplate\n","template = \"\"\"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say \"thanks for asking!\" at the end of the answer. 
\n","{context}\n","Question: {question}\n","Helpful Answer:\"\"\"\n","QA_CHAIN_PROMPT = PromptTemplate(input_variables=[\"context\", \"question\"],template=template,)\n","\n","# 运行 chain\n","from langchain.chains import RetrievalQA\n","question = \"Is probability a class topic?\"\n","qa_chain = RetrievalQA.from_chain_type(llm,\n"," retriever=vectordb.as_retriever(),\n"," return_source_documents=True,\n"," chain_type_kwargs={\"prompt\": QA_CHAIN_PROMPT})\n","\n","\n","result = qa_chain({\"query\": question})\n","result[\"result\"]"]},{"cell_type":"code","execution_count":6,"id":"b8aa6c11","metadata":{"height":351,"tags":[]},"outputs":[{"data":{"text/plain":["'概率是课堂主题之一,但这门课程不会过多涉及编程。需要对基本概率和统计学有一定的了解,以及基本的线性代数知识。谢谢你的提问!'"]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["# 构建 prompt\n","from langchain.prompts import PromptTemplate\n","template = \"\"\"使用以下上下文来回答最后的问题。如果你不知道答案,就说你不知道,不要试图编造答案。最多使用三句话。尽量使答案简明扼要。总是在回答的最后说“谢谢你的提问!”。\n","{context}\n","问题: {question}\n","有用的回答:\"\"\"\n","QA_CHAIN_PROMPT = PromptTemplate(input_variables=[\"context\", \"question\"],template=template,)\n","\n","# 运行 chain\n","from langchain.chains import RetrievalQA\n","question = \"概率是课堂主题吗?\"\n","qa_chain = RetrievalQA.from_chain_type(llm,\n"," retriever=vectordb.as_retriever(),\n"," return_source_documents=True,\n"," chain_type_kwargs={\"prompt\": QA_CHAIN_PROMPT})\n","\n","\n","result = qa_chain({\"query\": question})\n","result[\"result\"]"]},{"cell_type":"markdown","id":"84f8d2e2","metadata":{},"source":["## 二、记忆(Memory)"]},{"cell_type":"markdown","id":"5263e1ed","metadata":{"height":30},"source":["现在让我们更进一步,添加一些记忆功能。\n","\n","我们将使用 `ConversationBufferMemory`。它保存聊天消息历史记录的列表,这些历史记录将在回答问题时与问题一起传递给聊天机器人,从而将它们添加到上下文中。\n","\n","需要注意的是,我们之前讨论的上下文检索等方法,在这里同样可用。 "]},{"cell_type":"code","execution_count":7,"id":"7e99e754","metadata":{"height":113,"tags":[]},"outputs":[],"source":["from langchain.memory import ConversationBufferMemory\n","memory = 
ConversationBufferMemory(\n"," memory_key=\"chat_history\", # 与 prompt 的输入变量保持一致。\n"," return_messages=True # 将以消息列表的形式返回聊天记录,而不是单个字符串\n",")"]},{"cell_type":"markdown","id":"244e1b7a","metadata":{},"source":["## 三、对话检索链(ConversationalRetrievalChain)"]},{"cell_type":"markdown","id":"39700b1c","metadata":{"height":30},"source":["在检索 QA 链的基础上,对话检索链添加了新步骤,将历史记录和新问题压缩为一个独立的问题,并在向量库中进行相关文档的查找。此外,当有多个答案时,内存记录下所有答案。我们在运行和观察其效果后,可以在UI中查看具体细节。 \n","\n","对话检索链(ConversationalRetrievalChain)在 QA 检索链的基础上增加了一个新的环节,而不仅仅是 memory。\n","\n","具体来说,它增加了一个步骤,该步骤将历史记录和新问题浓缩为一个独立的问题,并将其传递给向量存储器以查找相关文档。\n","\n","\n","我们将在运行后在用户界面中查看其效果。但现在,让我们来试试。我们可以提出一个问题。这是没有任何历史记录的,看看我们得到的结果。然后我们可以针对答案提出一个后续问题。这和之前一样。所以我们问,概率是一个课堂主题吗?我们会得到一些答案。"]},{"cell_type":"code","execution_count":8,"id":"23700502","metadata":{"height":147,"tags":[]},"outputs":[],"source":["from langchain.chains import ConversationalRetrievalChain\n","retriever=vectordb.as_retriever()\n","qa = ConversationalRetrievalChain.from_llm(\n"," llm,\n"," retriever=retriever,\n"," memory=memory\n",")"]},{"cell_type":"code","execution_count":10,"id":"e25b9fbb","metadata":{"height":62,"tags":[]},"outputs":[{"name":"stdout","output_type":"stream","text":["Yes, probability is a topic that will be assumed to be familiar to students in this class.
The instructor assumes that students have a basic understanding of probability and statistics, and will go over some of the prerequisites as a refresher course in the discussion sections.\n"]}],"source":["question = \"Is probability a class topic?\"\n","result = qa({\"question\": question})\n","print(result['answer'])"]},{"cell_type":"code","execution_count":9,"id":"622478ba","metadata":{"height":69,"scrolled":true,"tags":[]},"outputs":[{"name":"stdout","output_type":"stream","text":["是的,根据文本中的描述,概率是这门课程的一个基本主题,因此学生需要对基本的概率和统计知识有所了解。\n"]}],"source":["question = \"概率是一个课堂主题吗?\"\n","result = qa({\"question\": question})\n","print(result['answer'])"]},{"cell_type":"code","execution_count":11,"id":"5b280672","metadata":{"height":62,"tags":[]},"outputs":[{"name":"stdout","output_type":"stream","text":["The reason for needing those prerequisites in the class is that the instructor assumes that all students have a basic knowledge of computer science and knowledge of the basic computer skills and principles, such as knowing what big-O notation is.\n"]}],"source":["question = \"why are those prerequesites needed?\"\n","result = qa({\"question\": question})\n","print(result['answer'])"]},{"cell_type":"code","execution_count":10,"id":"fcd08a60","metadata":{"height":70,"tags":[]},"outputs":[{"name":"stdout","output_type":"stream","text":["学生需要对基本的概率和统计知识有所了解,因为这些知识是机器学习的基础。在机器学习中,我们需要使用概率和统计来理解和解决各种问题,例如分类、回归、聚类等。如果学生没有这些基础知识,将很难理解机器学习的概念和方法。此外,对于那些已经学过概率和统计的学生,这门课程将帮助他们将这些知识应用到机器学习中。\n"]}],"source":["question = \"为什么需要这些先决条件?\"\n","result = qa({\"question\": question})\n","print(result['answer'])"]},{"cell_type":"markdown","id":"31e0742e-2085-4635-b87d-6596925f1077","metadata":{"tags":[]},"source":["## 四、创建一个适用于您文档的聊天机器人"]},{"cell_type":"code","execution_count":12,"id":"8622a19c-0711-49aa-8d1d-9ddfb6af7032","metadata":{"height":181,"tags":[]},"outputs":[],"source":["from langchain.embeddings.openai import OpenAIEmbeddings\n","from langchain.text_splitter import 
CharacterTextSplitter, RecursiveCharacterTextSplitter\n","from langchain.vectorstores import DocArrayInMemorySearch\n","from langchain.document_loaders import TextLoader\n","from langchain.chains import RetrievalQA, ConversationalRetrievalChain\n","from langchain.memory import ConversationBufferMemory\n","from langchain.chat_models import ChatOpenAI\n","from langchain.document_loaders import TextLoader\n","from langchain.document_loaders import PyPDFLoader"]},{"cell_type":"markdown","id":"95f69afe","metadata":{},"source":["跟视频拍摄相比,聊天机器人代码已进行了一些更新。图形用户界面的外观也因运行平台而异。"]},{"cell_type":"code","execution_count":13,"id":"69dff120","metadata":{"height":404},"outputs":[],"source":["def load_db(file, chain_type, k):\n"," \"\"\"\n"," 该函数用于加载 PDF 文件,切分文档,生成文档的嵌入向量,创建向量数据库,定义检索器,并创建聊天机器人实例。\n","\n"," 参数:\n"," file (str): 要加载的 PDF 文件路径。\n"," chain_type (str): 链类型,用于指定聊天机器人的类型。\n"," k (int): 在检索过程中,返回最相似的 k 个结果。\n","\n"," 返回:\n"," qa (ConversationalRetrievalChain): 创建的聊天机器人实例。\n"," \"\"\"\n"," # 载入文档\n"," loader = PyPDFLoader(file)\n"," documents = loader.load()\n"," # 切分文档\n"," text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)\n"," docs = text_splitter.split_documents(documents)\n"," # 定义 Embeddings\n"," embeddings = OpenAIEmbeddings()\n"," # 根据数据创建向量数据库\n"," db = DocArrayInMemorySearch.from_documents(docs, embeddings)\n"," # 定义检索器\n"," retriever = db.as_retriever(search_type=\"similarity\", search_kwargs={\"k\": k})\n"," # 创建 chatbot 链,Memory 由外部管理\n"," qa = ConversationalRetrievalChain.from_llm(\n"," llm=ChatOpenAI(model_name=llm_name, temperature=0), \n"," chain_type=chain_type, \n"," retriever=retriever, \n"," return_source_documents=True,\n"," return_generated_question=True,\n"," )\n"," return qa \n"]},{"cell_type":"code","execution_count":14,"id":"b525019d-3acd-40ff-9844-141116155d68","metadata":{"height":1320,"tags":[]},"outputs":[],"source":["import panel as pn\n","import param\n","\n","# 用于存储聊天记录、回答、数据库查询和回复\n","class 
cbfs(param.Parameterized):\n"," chat_history = param.List([])\n"," answer = param.String(\"\")\n"," db_query = param.String(\"\")\n"," db_response = param.List([])\n"," \n"," def __init__(self, **params):\n"," super(cbfs, self).__init__( **params)\n"," self.panels = []\n"," self.loaded_file = \"docs/cs229_lectures/MachineLearning-Lecture01.pdf\"\n"," self.qa = load_db(self.loaded_file,\"stuff\", 4)\n"," \n"," # 将文档加载到聊天机器人中\n"," def call_load_db(self, count):\n"," \"\"\"\n"," count: 数量\n"," \"\"\"\n"," if count == 0 or file_input.value is None: # 初始化或未指定文件 :\n"," return pn.pane.Markdown(f\"Loaded File: {self.loaded_file}\")\n"," else:\n"," file_input.save(\"temp.pdf\") # 本地副本\n"," self.loaded_file = file_input.filename\n"," button_load.button_style=\"outline\"\n"," self.qa = load_db(\"temp.pdf\", \"stuff\", 4)\n"," button_load.button_style=\"solid\"\n"," self.clr_history()\n"," return pn.pane.Markdown(f\"Loaded File: {self.loaded_file}\")\n","\n"," # 处理对话链\n"," def convchain(self, query):\n"," \"\"\"\n"," query: 用户的查询\n"," \"\"\"\n"," if not query:\n"," return pn.WidgetBox(pn.Row('User:', pn.pane.Markdown(\"\", width=600)), scroll=True)\n"," result = self.qa({\"question\": query, \"chat_history\": self.chat_history})\n"," self.chat_history.extend([(query, result[\"answer\"])])\n"," self.db_query = result[\"generated_question\"]\n"," self.db_response = result[\"source_documents\"]\n"," self.answer = result['answer'] \n"," self.panels.extend([\n"," pn.Row('User:', pn.pane.Markdown(query, width=600)),\n"," pn.Row('ChatBot:', pn.pane.Markdown(self.answer, width=600, styles={'background-color': '#F6F6F6'}))\n"," ])\n"," inp.value = '' # 清除时清除装载指示器\n"," return pn.WidgetBox(*self.panels,scroll=True)\n"," \n"," # 获取最后发送到数据库的问题\n"," @param.depends('db_query ', )\n"," def get_lquest(self):\n"," if not self.db_query :\n"," return pn.Column(\n"," pn.Row(pn.pane.Markdown(f\"Last question to DB:\", styles={'background-color': '#F6F6F6'})),\n"," pn.Row(pn.pane.Str(\"no DB accesses
so far\"))\n"," )\n"," return pn.Column(\n"," pn.Row(pn.pane.Markdown(f\"DB query:\", styles={'background-color': '#F6F6F6'})),\n"," pn.pane.Str(self.db_query )\n"," )\n"," \n"," # 获取数据库返回的源文件\n"," @param.depends('db_response', )\n"," def get_sources(self):\n"," if not self.db_response:\n"," return \n"," rlist=[pn.Row(pn.pane.Markdown(f\"Result of DB lookup:\", styles={'background-color': '#F6F6F6'}))]\n"," for doc in self.db_response:\n"," rlist.append(pn.Row(pn.pane.Str(doc)))\n"," return pn.WidgetBox(*rlist, width=600, scroll=True)\n","\n"," # 获取当前聊天记录\n"," @param.depends('convchain', 'clr_history') \n"," def get_chats(self):\n"," if not self.chat_history:\n"," return pn.WidgetBox(pn.Row(pn.pane.Str(\"No History Yet\")), width=600, scroll=True)\n"," rlist=[pn.Row(pn.pane.Markdown(f\"Current Chat History variable\", styles={'background-color': '#F6F6F6'}))]\n"," for exchange in self.chat_history:\n"," rlist.append(pn.Row(pn.pane.Str(exchange)))\n"," return pn.WidgetBox(*rlist, width=600, scroll=True)\n"," \n"," # 清除聊天记录\n"," def clr_history(self,count=0):\n"," self.chat_history = []\n"," return \n"]},{"cell_type":"markdown","id":"54958d9d-2a0f-457d-9a2b-97e6c9859684","metadata":{"tags":[]},"source":["## 五、创建聊天机器人"]},{"cell_type":"code","execution_count":15,"id":"73b7f691-6314-4fcc-9e48-0d09d61099fd","metadata":{"height":691,"tags":[]},"outputs":[{"data":{},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.holoviews_exec.v0+json":"","text/html":["\n",""],"text/plain":["Column\n"," [0] Row\n"," [0] Markdown(str)\n"," [1] Tabs\n"," [0] Column\n"," [0] Row\n"," [0] TextInput(placeholder='Enter text here…')\n"," [1] Divider()\n"," [2] ParamFunction(function, _pane=WidgetBox, defer_load=False, height=300, loading_indicator=True)\n"," [3] Divider()\n"," [1] Column\n"," [0] ParamMethod(method, _pane=Column, defer_load=False)\n"," [1] Divider()\n"," [2] ParamMethod(method, _pane=Str, defer_load=False)\n"," [2] Column\n"," [0] ParamMethod(method, 
_pane=WidgetBox, defer_load=False)\n"," [1] Divider()\n"," [3] Column\n"," [0] Row\n"," [0] FileInput(accept='.pdf')\n"," [1] Button(button_type='primary', name='Load DB')\n"," [2] ParamFunction(function, _pane=Markdown, defer_load=False)\n"," [1] Row\n"," [0] Button(button_type='warning', name='Clear History')\n"," [1] Markdown(str)\n"," [2] Divider()\n"," [3] Row\n"," [0] Image(str, width=400)"]},"execution_count":15,"metadata":{"application/vnd.holoviews_exec.v0+json":{"id":"747fb5bf-923d-4aee-a1ad-027155ab96d6"}},"output_type":"execute_result"}],"source":["# 初始化聊天机器人\n","cb = cbfs() \n","\n","# 定义界面的小部件\n","file_input = pn.widgets.FileInput(accept='.pdf') # PDF 文件的文件输入小部件\n","button_load = pn.widgets.Button(name=\"Load DB\", button_type='primary') # 加载数据库的按钮\n","button_clearhistory = pn.widgets.Button(name=\"Clear History\", button_type='warning') # 清除聊天记录的按钮\n","button_clearhistory.on_click(cb.clr_history) # 将清除历史记录功能绑定到按钮上\n","inp = pn.widgets.TextInput( placeholder='Enter text here…') # 用于用户查询的文本输入小部件\n","\n","# 将加载数据库和对话的函数绑定到相应的部件上\n","bound_button_load = pn.bind(cb.call_load_db, button_load.param.clicks)\n","conversation = pn.bind(cb.convchain, inp) \n","\n","jpg_pane = pn.pane.Image( './img/convchain.jpg')\n","\n","# 使用 Panel 定义界面布局\n","tab1 = pn.Column(\n"," pn.Row(inp),\n"," pn.layout.Divider(),\n"," pn.panel(conversation, loading_indicator=True, height=300),\n"," pn.layout.Divider(),\n",")\n","tab2= pn.Column(\n"," pn.panel(cb.get_lquest),\n"," pn.layout.Divider(),\n"," pn.panel(cb.get_sources ),\n",")\n","tab3= pn.Column(\n"," pn.panel(cb.get_chats),\n"," pn.layout.Divider(),\n",")\n","tab4=pn.Column(\n"," pn.Row( file_input, button_load, bound_button_load),\n"," pn.Row( button_clearhistory, pn.pane.Markdown(\"Clears chat history. 
Can use to start a new topic\" )),\n"," pn.layout.Divider(),\n"," pn.Row(jpg_pane.clone(width=400))\n",")\n","# 将所有选项卡合并为一个仪表盘\n","dashboard = pn.Column(\n"," pn.Row(pn.pane.Markdown('# ChatWithYourData_Bot')),\n"," pn.Tabs(('Conversation', tab1), ('Database', tab2), ('Chat History', tab3),('Configure', tab4))\n",")\n","dashboard"]},{"cell_type":"markdown","id":"d2d2a016","metadata":{"height":30},"source":["请随意复制此代码并对其进行修改以添加您自己的功能。您可以通过更改 `load_db` 函数和 `convchain` 方法中的配置来尝试备用内存和检索器模型。[panel](https://panel.holoviz.org/) 和 [Param](https://param.holoviz.org/) 有许多有用的特性和小部件,可以用来扩展GUI。"]},{"cell_type":"markdown","id":"d0275b31-b5e3-4e15-a77f-1bf30c3e04e4","metadata":{},"source":["## 六、致谢\n","\n","基于 panel 的聊天机器人的灵感来自 Sophia Yang, [github](https://github.com/sophiamyang/tutorials-LangChain)"]}],"metadata":{"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.12"}},"nbformat":4,"nbformat_minor":5}
diff --git "a/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/requirements.txt" "b/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/requirements.txt"
deleted file mode 100644
index 929b8079..00000000
--- "a/content/\345\277\205\344\277\256\345\233\233-LangChain Chat with Your Data/requirements.txt"
+++ /dev/null
@@ -1,13 +0,0 @@
-chromadb==0.5.0
-docarray==0.40.0
-jupyter_bokeh==4.0.5
-lark==1.1.9
-langchain==0.2.3
-langchain_community==0.2.4
-openai==1.33.0
-panel==1.4.4
-pydub==0.25.1
-pypdf==4.2.0
-python-dotenv==1.0.1
-scikit-learn==1.5.0
-tiktoken==0.7.0
\ No newline at end of file
diff --git "a/content/\351\200\211\344\277\256-Functions, Tools and Agents with LangChain/1. \347\256\200\344\273\213 Introduction.md" "b/content/\351\200\211\344\277\256-Functions, Tools and Agents with LangChain/1. \347\256\200\344\273\213 Introduction.md"
deleted file mode 100644
index cd1a96fd..00000000
--- "a/content/\351\200\211\344\277\256-Functions, Tools and Agents with LangChain/1. \347\256\200\344\273\213 Introduction.md"
+++ /dev/null
@@ -1,26 +0,0 @@
-# 第一章 简介
-
-欢迎来到《使用 LangChain 的函数、工具和代理》课程,让我们一起开始本次学习!
-
-本教程由 LangChain 创始人 Harrison Chase 与 DeepLearning.AI 合作推出,将介绍如何使用 OpenAI 的 API 以及使用 LangChain 表达式语言(LCEL)来构建 Agent。
-
-## 一、背景
-
-随着大模型的快速发展,LLM 已经展现出使用自然语言与人类进行交互的强大能力。
-
-现在人们不再局限于用 LLM 来生成文本内容,而是希望它能与现有的软件基础设施进行交互,比如能够自主判断是否需要调用函数,灵活地将代码作为子程序来调用。这样就可以处理一些格式化的数据(比如 JSON 格式数据的值),而不仅仅是单纯的文本;这类数据有时候并不能很好地被 LLM 所理解。
-
-## 二、课程基本内容
-
-LangChain 是用于构建大模型应用程序的开源框架,搭建了一个开发者与 LLMs 之间的桥梁,并且能支持多个 LLMs 模型、向量存储等等功能,在前面的课程也有介绍过 LangChain。
-
-本次课程主要是顺应 LLMs 的潮流,如今 LangChain 与 ChatGPT 的 API 函数调用都有了新的更新,使用了一种名为`LangChain Expression Language(LCEL)`的新语法构建对话代理,用于标记、提取、工具选择和路由等任务。
-
-在本次课程中,主要内容如下:
-
-1. 介绍 OpenAI 的 API 调用的新功能,该功能能够对模型的输出进行格式化。
-2. 学习 LangChain 表达式语言 LCEL(LangChain Expression Language)的开发,它简化了组合组件或链的过程。
-3. 学习如何将函数应用于标记和数据提取等任务。
-4. 利用 LangChain 工具和 LLMs 函数构建一个功能强大的对话代理(Agent),能够灵活选择工具进行推理。
-
-通过使用这些工具和函数,我们可以让代理进行复杂的多步推理,并且自动选择工具来解决问题,增强 Agent 的智能,让我们一起探索 Agent 的强大能力吧!
diff --git "a/content/\351\200\211\344\277\256-Functions, Tools and Agents with LangChain/2. OpenAI\345\207\275\346\225\260\350\260\203\347\224\250 OpenAI Function Calling.ipynb" "b/content/\351\200\211\344\277\256-Functions, Tools and Agents with LangChain/2. OpenAI\345\207\275\346\225\260\350\260\203\347\224\250 OpenAI Function Calling.ipynb"
deleted file mode 100644
index 4b177a3a..00000000
--- "a/content/\351\200\211\344\277\256-Functions, Tools and Agents with LangChain/2. OpenAI\345\207\275\346\225\260\350\260\203\347\224\250 OpenAI Function Calling.ipynb"
+++ /dev/null
@@ -1,1259 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "7134fad1",
- "metadata": {
- "id": "7134fad1"
- },
- "source": [
- "# 第二章 OpenAI 函数调用 OpenAI Function Calling\n",
- "\n",
- "本章主要介绍 OpenAI 的 API 提供的新功能,在最近几个月,OpenAI 对一些新的模型进行了微调,分别是`gpt-3.5-turbo-0613`和`gpt-4-0613`。\n",
- "\n",
-    "假如我们有一个在某些特定情况下需要调用的工具函数。经过微调后,这些模型可以接收一个新的参数,并根据这个参数自动判断是否需要调用工具函数;如果判断需要调用,就会返回该工具函数的名称和对应的输入参数。\n",
- "\n",
- "\n",
- "- [一、OpenAI函数新参数](#一openai函数新参数)\n",
- " - [1.1 简单例子:得到当前天气](#11-简单例子得到当前天气) \n",
- " - [1.2 新参数:functions](#12-新参数functions) \n",
- " - [1.3 相关提示调用结果](#13-相关提示调用结果) \n",
- " - [1.4 无关提示调用结果](#14-无关提示调用结果)\n",
- "- [二、Function Call参数模式](#二function-call参数模式)\n",
- " - [2.1 自动判断是否调用](#21-自动判断是否调用) \n",
- " - [2.2 强制不调用](#22-强制不调用) \n",
- " - [2.3 强制调用](#23-强制调用) \n",
- "- [三、函数调用以及执行函数](#三函数调用以及执行函数) \n",
- "- [四、英文版提示](#四英文版提示)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "d0d4e112",
- "metadata": {},
- "source": [
- "## 一、OpenAI函数新参数\n",
- "\n",
-    "首先我们直接设置`OPENAI_API_KEY`,方便后续调用 OpenAI 的 API 接口并使用其函数调用功能。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "68b323d9",
- "metadata": {
- "id": "68b323d9"
- },
- "outputs": [],
- "source": [
- "import os\n",
- "import openai\n",
- "\n",
- "os.environ['OPENAI_API_KEY'] = \"YOUR_API_KEY\"\n",
- "openai.api_key = os.environ['OPENAI_API_KEY']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "fba14f39",
- "metadata": {
- "id": "fba14f39"
- },
- "source": [
- "### 1.1 简单例子:得到当前天气\n",
- "\n",
-    "首先我们来看 OpenAI 使用的第一个例子:定义一个`get_current_weather`函数。正常来说,获取当前天气是语言模型本身不能完全做到的事情。因此,我们希望能够将语言模型和这样的函数结合起来,用当前的信息来增强它。\n",
-    "\n",
-    "在这个函数中,我们固定了返回的值,比如温度固定为 22 摄氏度;但在实际应用中,这里可能需要调用天气 API 或一些外部知识源。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "id": "041aacb9",
- "metadata": {
- "id": "041aacb9"
- },
- "outputs": [],
- "source": [
- "import json\n",
- "\n",
- "# 定义一个函数,用于获取给定位置的当前天气\n",
- "def get_current_weather(location, unit=\"摄氏度\"):\n",
- " \"\"\"获取指定位置的当前天气\"\"\"\n",
- " # 模拟返回相同的天气情况的示例函数\n",
- " # 在实际应用环境中,这可以是天气API\n",
- " # 创建一个天气信息的字典\n",
- " weather_info = {\n",
- " \"location\": location, # 天气的位置\n",
- " \"temperature\": \"22\", # 温度\n",
- " \"unit\": unit, # 温度单位,默认为摄氏度\n",
- " \"forecast\": [\"晴\", \"多云\"], # 天气预报\n",
- " }\n",
- " # 将天气信息转换为JSON格式的字符串并返回\n",
- " return json.dumps(weather_info)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "a7ba1a92",
- "metadata": {
- "id": "a7ba1a92"
- },
- "source": [
- "### 1.2 新参数:functions\n",
- "\n",
- "那我们如何将这样的函数传给语言模型呢?OpenAI引入了一个名为`functions`的新参数,通过该参数,我们可以传递一个函数定义列表。由于我们只有一个函数,所以列表中只有一个元素。这是一个JSON对象,具有几个不同的参数。\n",
- "\n",
- "- name:函数的名称\n",
- "- description:函数的描述\n",
- "- parameters:参数对象,里面有一些属性设置\n",
- " - type:类型\n",
-    "  - properties:本身是一个对象,传入的是对应的参数的描述。在例子中我们可以看到有两个元素,`location`和`unit`。这些元素各自都有一个类型,即字符串,然后是一个描述。`unit`是一个外部参数的设置,比如这里我们希望它可以是摄氏度或华氏度,所以我们可以在这里定义它的类型以及枚举参数值。\n",
- " - required:必填的参数,比如这里我们需要的参数就是`location`。\n",
- "\n",
- "在函数定义中,`description`以及在`properties`中的参数非常重要,因为这些将直接传递给语言模型,语言模型将使用这些描述来判断是否使用此函数。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "id": "d00a4ad8",
- "metadata": {
- "id": "d00a4ad8"
- },
- "outputs": [],
- "source": [
- "# 定义一个函数\n",
- "functions = [\n",
- " {\n",
- " \"name\": \"get_current_weather\",\n",
- " \"description\": \"获取指定位置的当前天气情况\",\n",
- " \"parameters\": {\n",
- " \"type\": \"object\",\n",
- " \"properties\": {\n",
- " \"location\": {\n",
- " \"type\": \"string\",\n",
- " \"description\": \"城市和省份,例如:北京,北京市\",\n",
- " },\n",
- " \"unit\": {\"type\": \"string\", \"enum\": [\"摄氏度\", \"华氏度\"]},\n",
- " },\n",
- " \"required\": [\"location\"],\n",
- " },\n",
- " }\n",
- "]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ecfb5fd6",
- "metadata": {
- "id": "ecfb5fd6"
- },
- "source": [
- "### 1.3 相关提示调用结果\n",
- "\n",
-    "接下来我们就可以定义一个有关天气的问题,如“北京的天气怎么样?”,然后调用 OpenAI 的对话 API(`openai.ChatCompletion.create`)。首先我们得选取较新的模型,如`gpt-3.5-turbo-0613`,然后再将上述定义的`functions`列表传入,查看最后的响应结果。\n",
-    "\n",
-    "从结果上来看,返回的消息的角色是助手,内容为空,取而代之的是一个函数调用参数`function_call`,其中包含两个字段:`name`和`arguments`。`name`是`get_current_weather`,这与我们传递的函数的名称相同;`arguments`则是一个 JSON 格式的字符串,里面是我们需要的参数。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "6c9d6e3a",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{\n",
- " \"choices\": [\n",
- " {\n",
- " \"finish_reason\": \"function_call\",\n",
- " \"index\": 0,\n",
- " \"logprobs\": null,\n",
- " \"message\": {\n",
- " \"content\": null,\n",
- " \"function_call\": {\n",
- " \"arguments\": \"{\\n \\\"location\\\": \\\"\\u5317\\u4eac\\uff0c\\u5317\\u4eac\\u5e02\\\",\\n \\\"unit\\\": \\\"\\u6444\\u6c0f\\u5ea6\\\"\\n}\",\n",
- " \"name\": \"get_current_weather\"\n",
- " },\n",
- " \"role\": \"assistant\"\n",
- " }\n",
- " }\n",
- " ],\n",
- " \"created\": 1709602992,\n",
- " \"id\": \"chatcmpl-8zE7kIlpxzjsiuxH4q6wtRh8CLDH3\",\n",
- " \"model\": \"gpt-3.5-turbo-0613\",\n",
- " \"object\": \"chat.completion\",\n",
- " \"system_fingerprint\": null,\n",
- " \"usage\": {\n",
- " \"completion_tokens\": 30,\n",
- " \"prompt_tokens\": 95,\n",
- " \"total_tokens\": 125\n",
- " }\n",
- "}\n"
- ]
- }
- ],
- "source": [
- "import openai\n",
- "\n",
- "# 定义输入消息\n",
- "messages = [\n",
- " {\n",
- " \"role\": \"user\",\n",
- " \"content\": \"北京的天气怎么样?\"\n",
- " }\n",
- "]\n",
- "\n",
- "# 调用OpenAI的ChatCompletion API获取响应\n",
- "response = openai.ChatCompletion.create(\n",
- " model=\"gpt-3.5-turbo-0613\",\n",
- " messages=messages,\n",
- " functions=functions # 传入function参数\n",
- ")\n",
- "\n",
- "# 打印响应结果\n",
- "print(response)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "388c7dd5",
- "metadata": {},
- "source": [
- "我们也可以看到响应的参数,比如在这里就是两个参数,分别是`location`和`unit`。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "88851464",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{\n",
- " \"location\": \"北京,北京市\",\n",
- " \"unit\": \"摄氏度\"\n",
- "}\n"
- ]
- }
- ],
- "source": [
- "# 打印传入参数\n",
- "print(response[\"choices\"][0][\"message\"][\"function_call\"][\"arguments\"])"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "c04f92a6",
- "metadata": {
- "id": "c04f92a6"
- },
- "source": [
-    "如果我们仔细看响应的话,我们会发现内容现在是空的,`function_call`是一个字典,而`function_call`中的`arguments`参数本身则是一个 JSON 格式的字符串。因此我们可以使用`json.loads`将其加载到 Python 字典中,解析得到的参数可以传递给我们上述定义的`get_current_weather`函数。\n",
-    "\n",
-    "我们会发现,使用 OpenAI 进行函数调用并不会直接执行工具函数,它只是告诉我们要调用哪个函数(即名称),以及该函数的参数应该是什么,我们还是需要自己去调用工具函数。另外,由于它并没有去执行函数,如果我们在使用`json.loads`解码的时候遇到了一些问题,那实际上是模型的问题,所以这一部分可以考虑在写工具函数的时候做一些保护措施,后续也会讨论这一点。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "23c71606",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{\n",
- " \"content\": null,\n",
- " \"function_call\": {\n",
- " \"arguments\": \"{\\n \\\"location\\\": \\\"\\u5317\\u4eac\\uff0c\\u5317\\u4eac\\u5e02\\\",\\n \\\"unit\\\": \\\"\\u6444\\u6c0f\\u5ea6\\\"\\n}\",\n",
- " \"name\": \"get_current_weather\"\n",
- " },\n",
- " \"role\": \"assistant\"\n",
- "}\n"
- ]
- }
- ],
- "source": [
- "# 获取response里面的message信息\n",
- "response_message = response[\"choices\"][0][\"message\"]\n",
- "\n",
- "print(response_message)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1f47fc65",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{\n",
- " \"location\": \"北京,北京市\",\n",
- " \"unit\": \"摄氏度\"\n",
- "}\n"
- ]
- }
- ],
- "source": [
- "# 打印参数\n",
- "print(response[\"choices\"][0][\"message\"][\"function_call\"][\"arguments\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "03007bf0",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
-      "'{\"location\": \"\\\\u5317\\\\u4eac\\\\uff0c\\\\u5317\\\\u4eac\\\\u5e02\", \"temperature\": \"22\", \"unit\": \"\\\\u6444\\\\u6c0f\\\\u5ea6\", \"forecast\": [\"\\\\u6674\", \"\\\\u591a\\\\u4e91\"]}'"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "# 将JSON格式的字符串转换为Python对象\n",
-    "args = json.loads(response_message[\"function_call\"][\"arguments\"])\n",
-    "\n",
-    "# 调用get_current_weather函数,并将args解包为关键字参数传入\n",
-    "get_current_weather(**args)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "31263afb",
- "metadata": {
- "id": "31263afb"
- },
- "source": [
- "### 1.4 无关提示调用结果\n",
- "\n",
- "接下来,我们也探讨一下,如果问的问题与函数无关(也就是与天气无关),会返回什么样的信息呢?从结果上来看,我们可以看到返回的内容正常,并且没有`function_call`参数,也就是语言模型判断不需要使用工具函数。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "4a1ea679",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{\n",
- " \"choices\": [\n",
- " {\n",
- " \"finish_reason\": \"stop\",\n",
- " \"index\": 0,\n",
- " \"logprobs\": null,\n",
- " \"message\": {\n",
- " \"content\": \"\\u4f60\\u597d\\uff01\\u6709\\u4ec0\\u4e48\\u6211\\u53ef\\u4ee5\\u5e2e\\u52a9\\u4f60\\u7684\\u5417\\uff1f\",\n",
- " \"role\": \"assistant\"\n",
- " }\n",
- " }\n",
- " ],\n",
- " \"created\": 1709602993,\n",
- " \"id\": \"chatcmpl-8zE7lZ3keV5JGVTFqvAWOGxit4cXj\",\n",
- " \"model\": \"gpt-3.5-turbo-0613\",\n",
- " \"object\": \"chat.completion\",\n",
- " \"system_fingerprint\": null,\n",
- " \"usage\": {\n",
- " \"completion_tokens\": 19,\n",
- " \"prompt_tokens\": 88,\n",
- " \"total_tokens\": 107\n",
- " }\n",
- "}\n"
- ]
- }
- ],
- "source": [
- "# 与天气无关提示调用\n",
- "messages = [\n",
- " {\n",
- " \"role\": \"user\",\n",
- " \"content\": \"你好!\",\n",
- " }\n",
- "]\n",
- "\n",
- "response = openai.ChatCompletion.create(\n",
- " model=\"gpt-3.5-turbo-0613\",\n",
- " messages=messages,\n",
- " functions=functions,\n",
- " function_call=\"auto\",\n",
- ")\n",
- "\n",
- "print(response)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "6ed07bb6",
- "metadata": {},
- "source": [
- "由于编码问题,我们可以单独打印对应的返回的文本信息"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "a89df974",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "你好!有什么我可以帮助你的吗?\n"
- ]
- }
- ],
- "source": [
- "print(response[\"choices\"][0][\"message\"][\"content\"])"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "c67fac41",
- "metadata": {
- "id": "c67fac41"
- },
- "source": [
- "## 二、Function Call参数模式\n",
- "\n",
- "`function_call`参数一共有3种模式,我们可以传递其他参数`function_call`以强制模型使用或不使用函数。\n",
- "\n",
- "1. 默认情况下,它设置为`auto`,也就是模型自行选择。\n",
- "2. 在第二种模式中,我们可以强制它调用一个函数。如果我们希望始终返回函数调用的参数,就可以使用这种模式。\n",
- "3. 另一种模式是`none`。这会强制语言模型不使用提供的任何函数。\n",
- "\n",
- "### 2.1 自动判断是否调用\n",
- "\n",
- "`auto`模式就是大模型自行选择是否要返回参数,这一部分也是默认的,上述所有的方式都是`auto`的模式\n",
- "\n",
- "### 2.2 强制不调用\n",
- "\n",
- "`none`模式对于这个例子来说并不重要,因为内容`你好`不需要函数调用,所以我们看到它没有被使用。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "ba400aa9",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "ba400aa9",
- "outputId": "8b563e4f-c447-45a7-eb42-6390488a80b9"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{\n",
- " \"id\": \"chatcmpl-8nfuv5tlp6UaSwUfmeHppJkQyFpCW\",\n",
- " \"object\": \"chat.completion\",\n",
- " \"created\": 1706849893,\n",
- " \"model\": \"gpt-3.5-turbo-0613\",\n",
- " \"choices\": [\n",
- " {\n",
- " \"index\": 0,\n",
- " \"message\": {\n",
- " \"role\": \"assistant\",\n",
- " \"content\": \"\\u4f60\\u597d\\uff01\\u6709\\u4ec0\\u4e48\\u53ef\\u4ee5\\u5e2e\\u52a9\\u4f60\\u7684\\u5417\\uff1f\"\n",
- " },\n",
- " \"logprobs\": null,\n",
- " \"finish_reason\": \"stop\"\n",
- " }\n",
- " ],\n",
- " \"usage\": {\n",
- " \"prompt_tokens\": 88,\n",
- " \"completion_tokens\": 17,\n",
- " \"total_tokens\": 105\n",
- " },\n",
- " \"system_fingerprint\": null\n",
- "}\n",
- "你好!有什么可以帮助你的吗?\n"
- ]
- }
- ],
- "source": [
- "# 无关提示强制不调用\n",
- "messages = [\n",
- " {\n",
- " \"role\": \"user\",\n",
- " \"content\": \"你好\",\n",
- " }\n",
- "]\n",
- "response = openai.ChatCompletion.create(\n",
- " model=\"gpt-3.5-turbo-0613\",\n",
- " messages=messages,\n",
- " functions=functions,\n",
- " function_call=\"none\", # 传入参数强制不调用\n",
- ")\n",
- "print(response)\n",
- "print(response[\"choices\"][0][\"message\"][\"content\"])"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "3bcd0bb1",
- "metadata": {
- "id": "3bcd0bb1"
- },
- "source": [
- "那如果我们在需要使用工具函数的时候强制不调用(也就是使用`none`模式)会出现什么结果呢?从结果上来看,他依然有正常的`role`和`content`,但是因为我们强制不调用函数,所以他在试图返回正确的参数,但是并不正确。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "d19386d5",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "d19386d5",
- "outputId": "0f206c8e-21e9-4aca-c6b2-7674650d723e"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{\n",
- " \"id\": \"chatcmpl-8nfv0wFX6tqcaoU2gT4KqICymXDxa\",\n",
- " \"object\": \"chat.completion\",\n",
- " \"created\": 1706849898,\n",
- " \"model\": \"gpt-3.5-turbo-0613\",\n",
- " \"choices\": [\n",
- " {\n",
- " \"index\": 0,\n",
- " \"message\": {\n",
- " \"role\": \"assistant\",\n",
- " \"content\": \"\\u8bf7\\u7a0d\\u7b49\\uff0c\\u6211\\u4e3a\\u60a8\\u67e5\\u627e\\u5317\\u4eac\\u7684\\u5929\\u6c14\\u60c5\\u51b5\\u3002\"\n",
- " },\n",
- " \"logprobs\": null,\n",
- " \"finish_reason\": \"stop\"\n",
- " }\n",
- " ],\n",
- " \"usage\": {\n",
- " \"prompt_tokens\": 95,\n",
- " \"completion_tokens\": 18,\n",
- " \"total_tokens\": 113\n",
- " },\n",
- " \"system_fingerprint\": null\n",
- "}\n",
- "请稍等,我为您查找北京的天气情况。\n"
- ]
- }
- ],
- "source": [
- "# 相关提示强制不调用\n",
- "messages = [\n",
- " {\n",
- " \"role\": \"user\",\n",
- " \"content\": \"北京天气怎么样?\",\n",
- " }\n",
- "]\n",
- "response = openai.ChatCompletion.create(\n",
- " model=\"gpt-3.5-turbo-0613\",\n",
- " messages=messages,\n",
- " functions=functions,\n",
- " function_call=\"none\", # 传入参数强制不调用\n",
- ")\n",
- "print(response)\n",
- "print(response[\"choices\"][0][\"message\"][\"content\"])"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4ed657cb",
- "metadata": {
- "id": "4ed657cb"
- },
- "source": [
- "### 2.3 强制调用\n",
- "\n",
- "`function_call`参数的最后一个模式是强制调用函数,方法也是很简单,只要在参数里面传入这个函数的名字即可。在这一部分,我们指定`name`为`get_current_weather`,这将强制它使用`get_current_weather`函数。如果我们查看结果,我们实际上会看到返回了这个`function_call`对象,其中有`name`为`get_current_weather`的一些参数。\n",
- "\n",
- "为了好玩,这里面传了一个与天气无关的参数,然后强制他使用`get_current_weather`函数,但我们传入的参数中绝对没有任何信息关于它应该如何调用该函数。所以在这里,他编造了北京,北京市的参数,如果我们再次运行它,它会不断地调用北京市的参数。\n",
- "\n",
- "我们还可以尝试使用不同的模型函数来调用,不同的参数值,不同的输入消息,但是这里面要注意一个事情,首先,函数本身和描述都会计入传递给OpenAI的令牌使用限制。所以如果我们运行这个,我们可以看到返回的提示令牌是95。如果我们注释掉`functions`和`function_call`,我们可以看到提示令牌减少到15。因为OpenAI模型对令牌有限制,因此,在构造要传递给OpenAI的消息时,现在不仅需要注意消息的长度,还需要注意传递的函数的长度。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "c432ee4c",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "c432ee4c",
- "outputId": "507ca457-04ae-4a97-f7cd-6c7efd278ea8"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{\n",
- " \"id\": \"chatcmpl-8nfvkQDayf7kmYNdx4G84dkl1L5YC\",\n",
- " \"object\": \"chat.completion\",\n",
- " \"created\": 1706849944,\n",
- " \"model\": \"gpt-3.5-turbo-0613\",\n",
- " \"choices\": [\n",
- " {\n",
- " \"index\": 0,\n",
- " \"message\": {\n",
- " \"role\": \"assistant\",\n",
- " \"content\": null,\n",
- " \"function_call\": {\n",
- " \"name\": \"get_current_weather\",\n",
- " \"arguments\": \"{\\n \\\"location\\\": \\\"\\u5317\\u4eac\\uff0c\\u5317\\u4eac\\u5e02\\\"\\n}\"\n",
- " }\n",
- " },\n",
- " \"logprobs\": null,\n",
- " \"finish_reason\": \"stop\"\n",
- " }\n",
- " ],\n",
- " \"usage\": {\n",
- " \"prompt_tokens\": 95,\n",
- " \"completion_tokens\": 12,\n",
- " \"total_tokens\": 107\n",
- " },\n",
- " \"system_fingerprint\": null\n",
- "}\n",
- "{\n",
- " \"location\": \"北京,北京市\"\n",
- "}\n"
- ]
- }
- ],
- "source": [
- "# 无关提示强制调用函数\n",
- "messages = [\n",
- " {\n",
- " \"role\": \"user\",\n",
- " \"content\": \"你好!\",\n",
- " }\n",
- "]\n",
- "response = openai.ChatCompletion.create(\n",
- " model=\"gpt-3.5-turbo-0613\",\n",
- " messages=messages,\n",
- " functions=functions,\n",
- " function_call={\"name\": \"get_current_weather\"}, # 强制调用函数get_current_weather\n",
- ")\n",
- "print(response)\n",
- "print(response[\"choices\"][0][\"message\"][\"function_call\"][\"arguments\"])"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f3306ca8",
- "metadata": {
- "id": "f3306ca8"
- },
- "source": [
- "## 三、函数调用以及执行函数\n",
- "\n",
- "最后,让我们看一下如何将这些函数调用和实际执行函数调用的结果传递回语言模型。这很重要,因为通常我们希望使用语言模型确定要调用的函数,然后运行该函数,但然后将其传递回语言模型以获得最终响应。\n",
- "\n",
- "我们会经过这样的流程,首先我们问问题得到一个带有`function call`参数的响应,然后我们将这个消息添加到我们的消息列表中。然后我们可以模拟使用语言模型提供的参数调用`get_current_weather`函数,并且保存到一个变量`observation`中。接着我们定义一个新的消息列表,表示刚刚调用函数的结果,这里面有一个重要的点就是`role`等于`function`,也就是告诉语言模型这是调用函数的响应。除此之外,还传递函数的名称`name`以及`content`变量,设为上述计算的`observation`。\n",
- "\n",
- "如果我们然后使用这个消息列表调用语言模型,我们可以看到语言模型回答的非常好:北京的天气目前是22摄氏度,天气以晴天为主,也有多云的情况。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f1636ae0",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{\n",
- " \"choices\": [\n",
- " {\n",
- " \"finish_reason\": \"stop\",\n",
- " \"index\": 0,\n",
- " \"logprobs\": null,\n",
- " \"message\": {\n",
- " \"content\": \"\\u4f60\\u597d\\uff01\\u6709\\u4ec0\\u4e48\\u6211\\u53ef\\u4ee5\\u5e2e\\u52a9\\u4f60\\u7684\\u5417\\uff1f\",\n",
- " \"role\": \"assistant\"\n",
- " }\n",
- " }\n",
- " ],\n",
- " \"created\": 1709602993,\n",
- " \"id\": \"chatcmpl-8zE7lZ3keV5JGVTFqvAWOGxit4cXj\",\n",
- " \"model\": \"gpt-3.5-turbo-0613\",\n",
- " \"object\": \"chat.completion\",\n",
- " \"system_fingerprint\": null,\n",
- " \"usage\": {\n",
- " \"completion_tokens\": 19,\n",
- " \"prompt_tokens\": 88,\n",
- " \"total_tokens\": 107\n",
- " }\n",
- "}\n"
- ]
- }
- ],
- "source": [
- "# 与天气无关提示调用\n",
- "messages = [\n",
- " {\n",
- " \"role\": \"user\",\n",
- " \"content\": \"你好!\",\n",
- " }\n",
- "]\n",
- "\n",
- "response = openai.ChatCompletion.create(\n",
- " model=\"gpt-3.5-turbo-0613\",\n",
- " messages=messages,\n",
- " functions=functions,\n",
- " function_call=\"auto\",\n",
- ")\n",
- "\n",
- "print(response)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b1577b5b",
- "metadata": {},
- "source": [
- "由于编码问题,我们可以单独打印对应的返回的文本信息"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "2f6dff4a",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "你好!有什么我可以帮助你的吗?\n"
- ]
- }
- ],
- "source": [
- "print(response[\"choices\"][0][\"message\"][\"content\"])"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "5940bf6d",
- "metadata": {},
- "source": [
- "## 四、英文版提示\n",
- "\n",
- "**1.1 简单例子:得到当前天气**"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "61a23996",
- "metadata": {
- "id": "61a23996"
- },
- "outputs": [],
- "source": [
- "import json\n",
- "\n",
- "# Example dummy function hard coded to return the same weather\n",
- "# In production, this could be your backend API or an external API\n",
- "def get_current_weather(location, unit=\"fahrenheit\"):\n",
- " \"\"\"Get the current weather in a given location\"\"\"\n",
- " weather_info = {\n",
- " \"location\": location,\n",
- " \"temperature\": \"72\",\n",
- " \"unit\": unit,\n",
- " \"forecast\": [\"sunny\", \"windy\"],\n",
- " }\n",
- " return json.dumps(weather_info)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "44c5d2d2",
- "metadata": {
- "id": "44c5d2d2"
- },
- "source": [
- "**1.2 定义function函数**"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "4152b105",
- "metadata": {
- "id": "4152b105"
- },
- "outputs": [],
- "source": [
- "# define a function\n",
- "functions = [\n",
- " {\n",
- " \"name\": \"get_current_weather\",\n",
- " \"description\": \"Get the current weather in a given location\",\n",
- " \"parameters\": {\n",
- " \"type\": \"object\",\n",
- " \"properties\": {\n",
- " \"location\": {\n",
- " \"type\": \"string\",\n",
- " \"description\": \"The city and state, e.g. San Francisco, CA\",\n",
- " },\n",
- " \"unit\": {\"type\": \"string\", \"enum\": [\"celsius\", \"fahrenheit\"]},\n",
- " },\n",
- " \"required\": [\"location\"],\n",
- " },\n",
- " }\n",
- "]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "668eb81c",
- "metadata": {
- "id": "668eb81c"
- },
- "source": [
- "**1.3 相关提示调用结果**"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "a6bd2994",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "a6bd2994",
- "outputId": "ae7da376-c365-4177-a1d0-a4df92f6815e"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{\n",
- " \"id\": \"chatcmpl-8nfyFSmPyo4RoV79D5Urk1QzLCf1C\",\n",
- " \"object\": \"chat.completion\",\n",
- " \"created\": 1706850099,\n",
- " \"model\": \"gpt-3.5-turbo-0613\",\n",
- " \"choices\": [\n",
- " {\n",
- " \"index\": 0,\n",
- " \"message\": {\n",
- " \"role\": \"assistant\",\n",
- " \"content\": null,\n",
- " \"function_call\": {\n",
- " \"name\": \"get_current_weather\",\n",
- " \"arguments\": \"{\\n \\\"location\\\": \\\"Boston, MA\\\"\\n}\"\n",
- " }\n",
- " },\n",
- " \"logprobs\": null,\n",
- " \"finish_reason\": \"function_call\"\n",
- " }\n",
- " ],\n",
- " \"usage\": {\n",
- " \"prompt_tokens\": 82,\n",
- " \"completion_tokens\": 18,\n",
- " \"total_tokens\": 100\n",
- " },\n",
- " \"system_fingerprint\": null\n",
- "}\n"
- ]
- }
- ],
- "source": [
- "messages = [\n",
- " {\n",
- " \"role\": \"user\",\n",
- " \"content\": \"What's the weather like in Boston?\"\n",
- " }\n",
- "]\n",
- "\n",
- "response = openai.ChatCompletion.create(\n",
- " model=\"gpt-3.5-turbo-0613\",\n",
- " messages=messages,\n",
- " functions=functions\n",
- ")\n",
- "\n",
- "print(response)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "a60020d4",
- "metadata": {
- "id": "a60020d4"
- },
- "source": [
- "**1.4 无关提示调用结果**"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "b04b3318",
- "metadata": {
- "id": "b04b3318"
- },
- "outputs": [],
- "source": [
- "messages = [\n",
- " {\n",
- " \"role\": \"user\",\n",
- " \"content\": \"hi!\",\n",
- " }\n",
- "]\n",
- "\n",
- "response = openai.ChatCompletion.create(\n",
- " model=\"gpt-3.5-turbo-0613\",\n",
- " messages=messages,\n",
- " functions=functions,\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "bfdbfe18",
- "metadata": {
- "id": "bfdbfe18"
- },
- "source": [
- "**2.1 自动判断是否调用**\n",
- "\n",
- "自动判断是否调用函数,判断提示是否和函数相关"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "id": "e8c48d2b",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "e8c48d2b",
- "outputId": "cf492711-60a5-42a7-e154-09a3b3174da1"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{\n",
- " \"id\": \"chatcmpl-8nfyHfC32MVE2gpE6HZxtmC7f7Mvi\",\n",
- " \"object\": \"chat.completion\",\n",
- " \"created\": 1706850101,\n",
- " \"model\": \"gpt-3.5-turbo-0613\",\n",
- " \"choices\": [\n",
- " {\n",
- " \"index\": 0,\n",
- " \"message\": {\n",
- " \"role\": \"assistant\",\n",
- " \"content\": \"Hello! How can I assist you today?\"\n",
- " },\n",
- " \"logprobs\": null,\n",
- " \"finish_reason\": \"stop\"\n",
- " }\n",
- " ],\n",
- " \"usage\": {\n",
- " \"prompt_tokens\": 76,\n",
- " \"completion_tokens\": 10,\n",
- " \"total_tokens\": 86\n",
- " },\n",
- " \"system_fingerprint\": null\n",
- "}\n"
- ]
- }
- ],
- "source": [
- "messages = [\n",
- " {\n",
- " \"role\": \"user\",\n",
- " \"content\": \"hi!\",\n",
- " }\n",
- "]\n",
- "response = openai.ChatCompletion.create(\n",
- " model=\"gpt-3.5-turbo-0613\",\n",
- " messages=messages,\n",
- " functions=functions,\n",
- " function_call=\"auto\",\n",
- ")\n",
- "print(response)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "32eb9b08",
- "metadata": {
- "id": "32eb9b08"
- },
- "source": [
- "**2.2 强制不调用**\n",
- "\n",
- "无关提示强制不调用,结果正常"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "id": "ba53560e",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "ba53560e",
- "outputId": "c529ab29-8c49-4b2c-9645-10ae8c018f7b"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{\n",
- " \"id\": \"chatcmpl-8nfyfrOoacEQCPO3O2sYLukAdbxX4\",\n",
- " \"object\": \"chat.completion\",\n",
- " \"created\": 1706850125,\n",
- " \"model\": \"gpt-3.5-turbo-0613\",\n",
- " \"choices\": [\n",
- " {\n",
- " \"index\": 0,\n",
- " \"message\": {\n",
- " \"role\": \"assistant\",\n",
- " \"content\": \"Hello! How can I assist you today?\"\n",
- " },\n",
- " \"logprobs\": null,\n",
- " \"finish_reason\": \"stop\"\n",
- " }\n",
- " ],\n",
- " \"usage\": {\n",
- " \"prompt_tokens\": 77,\n",
- " \"completion_tokens\": 9,\n",
- " \"total_tokens\": 86\n",
- " },\n",
- " \"system_fingerprint\": null\n",
- "}\n"
- ]
- }
- ],
- "source": [
- "messages = [\n",
- " {\n",
- " \"role\": \"user\",\n",
- " \"content\": \"hi!\",\n",
- " }\n",
- "]\n",
- "response = openai.ChatCompletion.create(\n",
- " model=\"gpt-3.5-turbo-0613\",\n",
- " messages=messages,\n",
- " functions=functions,\n",
- " function_call=\"none\",\n",
- ")\n",
- "print(response)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0e037da0",
- "metadata": {
- "id": "0e037da0"
- },
- "source": [
- "**2.3 强制调用**\n",
- "\n",
- "强制无关问题的调用"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "id": "32b16792",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "32b16792",
- "outputId": "37b06217-4632-46e4-ce07-453e73814081"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{\n",
- " \"id\": \"chatcmpl-8nfyzFFeUyzJDYVbdiVDgPEtNrnKE\",\n",
- " \"object\": \"chat.completion\",\n",
- " \"created\": 1706850145,\n",
- " \"model\": \"gpt-3.5-turbo-0613\",\n",
- " \"choices\": [\n",
- " {\n",
- " \"index\": 0,\n",
- " \"message\": {\n",
- " \"role\": \"assistant\",\n",
- " \"content\": null,\n",
- " \"function_call\": {\n",
- " \"name\": \"get_current_weather\",\n",
- " \"arguments\": \"{\\n \\\"location\\\": \\\"San Francisco, CA\\\"\\n}\"\n",
- " }\n",
- " },\n",
- " \"logprobs\": null,\n",
- " \"finish_reason\": \"stop\"\n",
- " }\n",
- " ],\n",
- " \"usage\": {\n",
- " \"prompt_tokens\": 83,\n",
- " \"completion_tokens\": 12,\n",
- " \"total_tokens\": 95\n",
- " },\n",
- " \"system_fingerprint\": null\n",
- "}\n"
- ]
- }
- ],
- "source": [
- "\n",
- "messages = [\n",
- " {\n",
- " \"role\": \"user\",\n",
- " \"content\": \"hi!\",\n",
- " }\n",
- "]\n",
- "response = openai.ChatCompletion.create(\n",
- " model=\"gpt-3.5-turbo-0613\",\n",
- " messages=messages,\n",
- " functions=functions,\n",
- " function_call={\"name\": \"get_current_weather\"},\n",
- ")\n",
- "print(response)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "65b85a74",
- "metadata": {
- "id": "65b85a74"
- },
- "source": [
- "强制有关问题调用,结果和`auto`一样"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "id": "b6f955de",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "b6f955de",
- "outputId": "b8134f2e-1746-4139-83f4-ac51327549cb"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{\n",
- " \"id\": \"chatcmpl-8nfzav6WbYIDLPHXYwaBXTM0byxaj\",\n",
- " \"object\": \"chat.completion\",\n",
- " \"created\": 1706850182,\n",
- " \"model\": \"gpt-3.5-turbo-0613\",\n",
- " \"choices\": [\n",
- " {\n",
- " \"index\": 0,\n",
- " \"message\": {\n",
- " \"role\": \"assistant\",\n",
- " \"content\": null,\n",
- " \"function_call\": {\n",
- " \"name\": \"get_current_weather\",\n",
- " \"arguments\": \"{\\n \\\"location\\\": \\\"Boston, MA\\\"\\n}\"\n",
- " }\n",
- " },\n",
- " \"logprobs\": null,\n",
- " \"finish_reason\": \"stop\"\n",
- " }\n",
- " ],\n",
- " \"usage\": {\n",
- " \"prompt_tokens\": 89,\n",
- " \"completion_tokens\": 11,\n",
- " \"total_tokens\": 100\n",
- " },\n",
- " \"system_fingerprint\": null\n",
- "}\n"
- ]
- }
- ],
- "source": [
- "messages = [\n",
- " {\n",
- " \"role\": \"user\",\n",
- " \"content\": \"What's the weather like in Boston!\",\n",
- " }\n",
- "]\n",
- "response = openai.ChatCompletion.create(\n",
- " model=\"gpt-3.5-turbo-0613\",\n",
- " messages=messages,\n",
- " functions=functions,\n",
- " function_call={\"name\": \"get_current_weather\"},\n",
- ")\n",
- "print(response)"
- ]
- }
- ],
- "metadata": {
- "colab": {
- "provenance": []
- },
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.13"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git "a/content/\351\200\211\344\277\256-Functions, Tools and Agents with LangChain/3. LangChain\350\241\250\350\276\276\345\274\217\350\257\255\350\250\200 LCEL.ipynb" "b/content/\351\200\211\344\277\256-Functions, Tools and Agents with LangChain/3. LangChain\350\241\250\350\276\276\345\274\217\350\257\255\350\250\200 LCEL.ipynb"
deleted file mode 100644
index 1832a8e5..00000000
--- "a/content/\351\200\211\344\277\256-Functions, Tools and Agents with LangChain/3. LangChain\350\241\250\350\276\276\345\274\217\350\257\255\350\250\200 LCEL.ipynb"
+++ /dev/null
@@ -1,1984 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "c5f7fce3",
- "metadata": {
- "id": "c5f7fce3"
- },
- "source": [
- "# 第三章 LangChain 表达式语言 LangChain Expression Language\n",
- "\n",
- "在这一章我们会介绍 LangChain Expression Language(或称为 LCEL),被称之为 Langchain 的表达式语言。LCEL 是一种新的语法,是 LangChain 工具包的重要补充,他有许多优点,使得我们处理 LangChain 和代理更加简单方便。\n",
- "\n",
- "1. LCEL 提供了异步、批处理和流处理支持,这使得代码多功能化,并且代码可以快速在不同服务器中应用和运行。\n",
- " - 异步:程序可以同时执行多个任务,而不是按照顺序一个接一个地执行\n",
- " - 批处理:是一种将一组任务或数据作为一个批次进行处理的方法,而不是逐个处理\n",
- " - 流式处理:是一种连续处理数据的方法,数据会持续不断地进入系统并被处理,流式处理能够在数据到达时立即进行处理,并且可以以持续且低延迟的方式处理数据。\n",
- "\n",
- "2. LCEL 拥有 fallbacks 措施,也叫回退安全机制,有时LLM得到的结果不可控,这时你可以将结果进行回退,甚至可以附加到整个链上\n",
- "\n",
- "3. LCEL 增加了 LLM 的并行性,LLM 运行通常是耗费时间的,并行可以加快得到结果的速度。\n",
- "\n",
- "4. LCEL 内置了日志记录,记录代理的运行情况。即使代理复杂,日志也有助于理解复杂链条和代理的运行情况。\n",
- "\n",
- "在前面的课程中,我们知道 LangChain 提供了组件 链(chain) 可以将组件组合起来发挥 LLM 更强大的功能,但是语法非常复杂。在这里,LCEL 提供了一种管道语法,使从基本组件构建复杂链变得容易,我们可以通过 LangChain 完成`Chain = prompt | LLM |OutputParser `的组合,具体使用我们将在下文内容中讨论。链(Chains)通常将大语言模型(LLM)与提示(Prompt)结合在一起,基于此,我们可以对文本或数据进行一系列操作。\n",
- "\n",
- "\n",
- "\n",
- "- [一、简单链 Simple Chain](#一、简单链-Simple-Chain)\n",
- "- [二、更复杂的链 More complex chain](#二、更复杂的链-More-complex-chain)\n",
- " - [2.1 构建简单向量数据库](#2.1-构建简单向量数据库)\n",
- " - [2.2 使用RunnableMap](#2.2-使用RunnableMap)\n",
- "- [三、绑定 Bind](#三、绑定-Bind)\n",
- " - [3.1 单函数绑定](#3.1-单函数绑定)\n",
- " - [3.2 多个函数绑定](#3.2-多个函数绑定)\n",
- "- [四、后备措施 Fallbacks](#四、后备措施-Fallbacks)\n",
- " - [4.1 使用早期模型格式化输出](#4.1-使用早期模型格式化输出)\n",
- " - [4.2 使用新模型格式化输出](#4.2-使用新模型格式化输出)\n",
- " - [4.3 fallbacks方法](#4.3-fallbacks方法)\n",
- "- [五、接口 Interface](#五、接口-Interface)\n",
- " - [5.1 invoke接口](#5.1-invoke接口)\n",
- " - [5.2 batch接口](#5.2-batch接口)\n",
- " - [5.3 stream接口](#5.3-stream接口)\n",
- " - [5.4 异步接口](#5.4-异步接口)\n",
- "- [六、英文版提示](#六、英文版提示)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b631e459",
- "metadata": {},
- "source": [
- "## 一、简单链 Simple Chain\n",
- "\n",
- "接下来我们依旧会使用 OpenAI 的 API,所以首先我们要初始化我们的 API_Key,这个方法和上一章的方式是一样的。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "cacb572a",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "cacb572a",
- "outputId": "1acef39b-494e-445e-bde0-2e8f83a2a84c"
- },
- "outputs": [],
- "source": [
- "# !pip install langchain\n",
- "# !pip install openai==0.28\n",
- "# !pip install \"langchain[docarray]\"\n",
- "# !pip install tiktoken"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "c52952e6",
- "metadata": {
- "id": "c52952e6"
- },
- "outputs": [],
- "source": [
- "import os\n",
- "import openai\n",
- "\n",
- "os.environ['OPENAI_API_KEY'] = \"YOUR_API_KEY\"\n",
- "openai.api_key = os.environ['OPENAI_API_KEY']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0a7f48e9",
- "metadata": {
- "id": "0a7f48e9"
- },
- "source": [
- "接下来首先导入 LangChain 的库,并且定义一个简单的链,这个链包括提示模板,大语言模型和一个输出解析器。我们可以看到,成功输出了大语言模型的结果,完成了一个简单的链。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "81ac70fe",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 91
- },
- "id": "81ac70fe",
- "outputId": "00302a8a-42eb-4eaa-ff0f-8540245932cd"
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/usr/local/lib/python3.10/dist-packages/langchain_core/_api/deprecation.py:117: LangChainDeprecationWarning: The class `langchain_community.chat_models.openai.ChatOpenAI` was deprecated in langchain-community 0.0.10 and will be removed in 0.2.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import ChatOpenAI`.\n",
- " warn_deprecated(\n"
- ]
- },
- {
- "data": {
- "application/vnd.google.colaboratory.intrinsic+json": {
- "type": "string"
- },
- "text/plain": [
- "'为什么熊不喜欢玩扑克牌?因为他总是把两个熊掌都露出来!'"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# 导入LangChain所需的模块\n",
- "from langchain.prompts import ChatPromptTemplate\n",
- "from langchain.chat_models import ChatOpenAI\n",
- "from langchain.schema.output_parser import StrOutputParser\n",
- "\n",
- "# 使用 ChatPromptTemplate 从模板创建一个提示,模板中的 {topic} 将在后续代码中替换为实际的话题\n",
- "prompt = ChatPromptTemplate.from_template(\n",
- " \"告诉我一个关于{topic}的短笑话\"\n",
- ")\n",
- "\n",
- "# 创建一个 ChatOpenAI 模型实例,默认使用 gpt-3.5-turbo 模型\n",
- "model = ChatOpenAI()\n",
- "\n",
- "# 创建一个StrOutputParser实例,用于解析输出\n",
- "output_parser = StrOutputParser()\n",
- "\n",
- "# 创建一个链式调用,将 prompt、model 和output_parser 连接在一起\n",
- "chain = prompt | model | output_parser\n",
- "\n",
- "# 调用链式调用,并传入参数\n",
- "chain.invoke({\"topic\": \"熊\"})"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7563f727",
- "metadata": {
- "id": "7563f727"
- },
- "source": [
- "如果我们去查看`Chain`的输出,我们会发现,他跟我们定义的是一样的,一共有三部分进行组成,也就是`Chain = prompt | LLM |OutputParser `。`|`符号类似于 unix 管道操作符,它将不同的组件链接在一起,将一个组件的输出作为输入提供给下一个组件。在这个链中,用户输入被传递给提示模板,然后提示模板输出被传递给模型,然后模型输出被传递到输出解析器。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "id": "a351d14a",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "a351d14a",
- "outputId": "e9a7f195-fdfb-4523-eb34-a708ce0e0b24"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "ChatPromptTemplate(input_variables=['topic'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['topic'], template='告诉我一个关于{topic}的短笑话'))])\n",
- "| ChatOpenAI(client=, openai_api_key='sk-***REDACTED***', openai_proxy='')\n",
- "| StrOutputParser()"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# 查看Chain的值\n",
- "chain"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "35f8c80e",
- "metadata": {
- "id": "35f8c80e"
- },
- "source": [
- "## 二、更复杂的链 More complex chain\n",
- "\n",
- "接下来,我们会创建一个更复杂的链条,在之前的课程中,我们接触过如何进行检索增强生成。所以接下来我们使用 LCEL 来重复之前的过程,将用户的问题和向量数据库检索结果结合起来,使用 RunnableMap 来构建一个更复杂的链。\n",
- "\n",
- "### 2.1 构建简单向量数据库\n",
- "首先我们构建一个向量数据库,这个简单的向量数据库只包含两句话,使用 OpenAI 的 Embedding 作为嵌入模型,然后我们通过 `vectorstore.as_retriever` 来创建一个检索器。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "308eab74",
- "metadata": {
- "id": "308eab74"
- },
- "outputs": [],
- "source": [
- "from langchain.embeddings import OpenAIEmbeddings\n",
- "from langchain.vectorstores import DocArrayInMemorySearch\n",
- "\n",
- "# 创建一个DocArrayInMemorySearch对象,用于存储和搜索文档向量\n",
- "vectorstore = DocArrayInMemorySearch.from_texts(\n",
- " [\"哈里森在肯肖工作\", \"熊喜欢吃蜂蜜\"],\n",
- " embedding=OpenAIEmbeddings() # 使用OpenAI的Embedding\n",
- ")\n",
- "\n",
- "# 创建一个检索器\n",
- "retriever = vectorstore.as_retriever()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "768c1f68",
- "metadata": {
- "id": "768c1f68"
- },
- "source": [
- "通过之前的学习,如果我们调用`retriever.get_relevant_documents`,我们会得到相关的检索文档,首先我们问“哈里森在哪里工作?”,我们会发现返回了一个文档列表,他会根据相似度排序返回文档列表,所以其中最相关的放在了第一个。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "f83e9118",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "f83e9118",
- "outputId": "4b3e4b1f-c378-43ec-8a2e-4b6a6a0d88b0"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[Document(page_content='哈里森在肯肖工作'), Document(page_content='熊喜欢吃蜂蜜')]"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# 获取与问题“哈里森在哪里工作?”相关的文档\n",
- "retriever.get_relevant_documents(\"哈里森在哪里工作?\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "a4b5eacb",
- "metadata": {
- "id": "a4b5eacb"
- },
- "source": [
- "如果我们换一个问题,比如\"熊喜欢吃什么\",可以看到返回文档的顺序就发生了变化。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "7d76aaf5",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "7d76aaf5",
- "outputId": "8e79e2d9-8572-44e0-edf7-70cd254e0252"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[Document(page_content='熊喜欢吃蜂蜜'), Document(page_content='哈里森在肯肖工作')]"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# 获取与问题“熊喜欢吃什么”相关的文档\n",
- "retriever.get_relevant_documents(\"熊喜欢吃什么\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "fb34293a",
- "metadata": {
- "id": "fb34293a"
- },
- "source": [
- "### 2.2 使用RunnableMap\n",
- "\n",
- "上述例子返回两个结果是因为向量数据库中只有两个文档,这一方法同样适用于更多文档的情况。接下来我们会加入`RunnableMap`,在这个`RunnableMap`中,不仅有用户的问题,还有与问题对应的文档列表,相当于为大模型的问题增加了上下文,这样就能完成检索增强的事情。如果我们正常问一个问题,可以看到,大模型正确的返回了文档里面的结果,得到了正确的输出。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "e8a9b652",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 35
- },
- "id": "e8a9b652",
- "outputId": "d615b86a-a14b-4917-833f-e70c91730477"
- },
- "outputs": [
- {
- "data": {
- "application/vnd.google.colaboratory.intrinsic+json": {
- "type": "string"
- },
- "text/plain": [
- "'肯肖'"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from langchain.schema.runnable import RunnableMap\n",
- "\n",
- "# 定义一个模板字符串template\n",
- "template = \"\"\"仅根据以下上下文回答问题:\n",
- "{context}\n",
- "\n",
- "问题:{question}\n",
- "\"\"\"\n",
- "\n",
- "# 使用 template 作为模板\n",
- "prompt = ChatPromptTemplate.from_template(template)\n",
- "\n",
- "# 创建一个处理链 chain ,包含了 RunnableMap、prompt、model 和 output_parser 组件\n",
- "chain = RunnableMap({\n",
- " \"context\": lambda x: retriever.get_relevant_documents(x[\"question\"]),\n",
- " \"question\": lambda x: x[\"question\"]\n",
- "}) | prompt | model | output_parser\n",
- "\n",
- "# 调用chain的invoke方法\n",
- "chain.invoke({\"question\": \"哈里森在哪里工作?\"})"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "394af88e",
- "metadata": {
- "id": "394af88e"
- },
- "source": [
- "如果我们想更深入挖掘一下背后的工作机理,我们可以看一下`RunnableMap`,我们把其创建为一个输入,用一样的方式进行操作。我们可以看到,在这之中,`RunnableMap`提供了`context`和`question`两个变量,一个是查询的文档列表,另一个是对应的问题,这个大模型就可以根据提供文档来总结回答对应的问题了。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "759d2af6",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "759d2af6",
- "outputId": "fa16fa6c-dcb0-46e9-e538-428e711db372"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'context': [Document(page_content='哈里森在肯肖工作'),\n",
- " Document(page_content='熊喜欢吃蜂蜜')],\n",
- " 'question': '哈里森在哪里工作?'}"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# 创建一个RunnableMap对象,其中包含两个键值对\n",
- "# 键 \"context\" 对应一个lambda函数,用于获取相关文档,函数输入参数为x,即输入的字典,函数返回值为retriever.get_relevant_documents(x[\"question\"])\n",
- "# 键 \"question\" 对应一个lambda函数,用于获取问题,函数输入参数为x,即输入的字典,函数返回值为x[\"question\"]\n",
- "inputs = RunnableMap({\n",
- " \"context\": lambda x: retriever.get_relevant_documents(x[\"question\"]),\n",
- " \"question\": lambda x: x[\"question\"]\n",
- "})\n",
- "\n",
- "# 调用 inputs 的 invoke 方法,并传递一个字典作为参数,字典中包含一个键值对,键为\"question\",值为\"哈里森在哪里工作?\"\n",
- "inputs.invoke({\"question\": \"哈里森在哪里工作?\"})"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "2c01b704",
- "metadata": {
- "id": "2c01b704"
- },
- "source": [
- "## 三、绑定 Bind\n",
- "\n",
- "在上一章我们介绍了OpenAI函数的调用,借助新的`functions`参数,大模型可以自动判断是否需要使用工具函数,如果需要就会返回调用该函数所需的参数。接下来我们也使用LangChain实现OpenAI函数调用的新功能,首先需要一个函数的描述信息,以及定义函数,这里的函数还是使用上一章的`get_current_weather`函数。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "1b00f651",
- "metadata": {
- "id": "1b00f651"
- },
- "outputs": [],
- "source": [
- "# 定义一个函数\n",
- "functions = [\n",
- " {\n",
- " \"name\": \"get_current_weather\",\n",
- " \"description\": \"获取指定位置的当前天气情况\",\n",
- " \"parameters\": {\n",
- " \"type\": \"object\",\n",
- " \"properties\": {\n",
- " \"location\": {\n",
- " \"type\": \"string\",\n",
- " \"description\": \"城市和省份,例如:北京,北京市\",\n",
- " },\n",
- " \"unit\": {\"type\": \"string\", \"enum\": [\"摄氏度\", \"华氏度\"]},\n",
- " },\n",
- " \"required\": [\"location\"],\n",
- " },\n",
- " }\n",
- "]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "5d2ff71d",
- "metadata": {
- "id": "5d2ff71d"
- },
- "source": [
- "### 3.1 单函数绑定\n",
- "\n",
- "接下来我们使用`bind`的方法把工具函数绑定到大模型上,并构建一个简单的链。进行调用以后,我们可以看到返回了一个`AIMessage`,其中返回的`content`为空,但是返回了我们需要调用工具函数的参数。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "2f693ff9",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "2f693ff9",
- "outputId": "9ebdd754-906f-4178-a31a-899ed3c7d304"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "AIMessage(content='', additional_kwargs={'function_call': {'name': 'get_current_weather', 'arguments': '{\\n \"location\": \"北京,北京市\"\\n}'}})"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# 使用ChatPromptTemplate.from_messages方法创建一个ChatPromptTemplate对象\n",
- "prompt = ChatPromptTemplate.from_messages(\n",
- " [\n",
- " (\"human\", \"{input}\")\n",
- " ]\n",
- ")\n",
- "\n",
- "# 使用bind方法绑定functions参数\n",
- "model = ChatOpenAI(temperature=0).bind(functions=functions)\n",
- "\n",
- "runnable = prompt | model\n",
- "\n",
- "# 调用invoke方法\n",
- "runnable.invoke({\"input\": \"北京天气怎么样?\"})"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "89a5fb09",
- "metadata": {
- "id": "89a5fb09"
- },
- "source": [
- "### 3.2 多个函数绑定\n",
- "\n",
- "同时我们也可以定义多个`function`,大模型在对话的时候可以自动判断使用哪一个函数。这里我们定义了两个函数:第一个是与前面天气查询函数类似的`weather_search`,用于搜索给定机场的天气;第二个是搜索体育赛事新闻的`sports_search`。查询天气的函数`weather_search`接受的参数为`airport_code`即机场代码,体育新闻搜索函数`sports_search`接受的参数为`team_name`即体育队名。这里我们不需要真正运行这些函数,因为大模型只是根据所提的问题自动判断是否需要调用这些函数并返回相应的参数,并不会直接帮我们调用。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "id": "b95f4029",
- "metadata": {
- "id": "b95f4029"
- },
- "outputs": [],
- "source": [
- "functions = [\n",
- " {\n",
- " \"name\": \"weather_search\",\n",
- " \"description\": \"搜索给定机场代码的天气\",\n",
- " \"parameters\": {\n",
- " \"type\": \"object\",\n",
- " \"properties\": {\n",
- " \"airport_code\": {\n",
- " \"type\": \"string\",\n",
- " \"description\": \"要获取天气的机场代码\"\n",
- " },\n",
- " },\n",
- " \"required\": [\"airport_code\"]\n",
- " }\n",
- " },\n",
- " {\n",
- " \"name\": \"sports_search\",\n",
- " \"description\": \"搜索最近体育赛事的新闻\",\n",
- " \"parameters\": {\n",
- " \"type\": \"object\",\n",
- " \"properties\": {\n",
- " \"team_name\": {\n",
- " \"type\": \"string\",\n",
- " \"description\": \"要搜索的体育队名\"\n",
- " },\n",
- " },\n",
- " \"required\": [\"team_name\"]\n",
- " }\n",
- " }\n",
- "]"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "d4a77d53",
- "metadata": {
- "id": "d4a77d53"
- },
- "source": [
- "接着我们就可以使用函数绑定大模型,定义一个简单的链,我们可以看到,当我们问了相关的问题以后,大模型能够自动判断并且正确返回参数,知道需要调用函数了。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "519e761d",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "519e761d",
- "outputId": "7ad43d65-bd62-4339-c8f8-e5c74a414dac"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "AIMessage(content='', additional_kwargs={'function_call': {'name': 'sports_search', 'arguments': '{\\n \"team_name\": \"爱国者队\"\\n}'}})"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# 绑定大模型\n",
- "model = model.bind(functions=functions)\n",
- "runnable = prompt | model\n",
- "\n",
- "runnable.invoke({\"input\": \"爱国者队昨天表现的怎么样?\"})"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0b988820",
- "metadata": {
- "id": "0b988820"
- },
- "source": [
- "## 四、后备措施 Fallbacks\n",
- "\n",
- "在使用早期的OpenAI模型如\"text-davinci-001\",这些模型在对话过程中,不支持格式化输出结果即它们都是以字符串的形式输出结果,这对我们有时候需要解析 LLM 的输出带来一些麻烦,比如下面这个例子,就是利用早期模型\"text-davinci-001\"来回答用户的问题,我们希望 llm 能以 json 格式输出结果。\n",
- "\n",
- "我们定义了 OpenAI 的模型并创建了一个简单的链,在链的末尾加入`json.loads`,希望能以 json 格式解析输出结果。我们让 simple_model 写三首诗,并以 json 格式输出,每首诗必须包含:`标题,作者和诗的第一句`。我们会发现结果只有字符串,无法输出指定格式的内容,虽然里面有一些`[`,但是本质上还是一个大的字符串,这就无法让我们解析输出。\n",
- "\n",
- "> 由于OpenAI于2024年1月4日停用了模型text-davinci-001,你将使用OpenAI推荐的替代模型gpt-3.5-turbo-instruct。\n",
- "\n",
- "在使用语言模型时,你可能经常会遇到来自底层 API 的问题,无论这些问题是速率限制还是停机时间。因此,当你将 LLM 应用程序转移到实际生产环境中时,防范这些问题变得越来越重要。这就是为什么我们引入了`回退(Fallbacks)`的概念。"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "127d93a1",
- "metadata": {},
- "source": [
- "### 4.1 使用早期模型格式化输出"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "id": "11e2b2e2",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 180
- },
- "id": "11e2b2e2",
- "outputId": "1294676f-7fbf-4185-ac16-49fa5d0512f5"
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/usr/local/lib/python3.10/dist-packages/langchain_core/_api/deprecation.py:117: LangChainDeprecationWarning: The class `langchain_community.llms.openai.OpenAI` was deprecated in langchain-community 0.0.10 and will be removed in 0.2.0. An updated version of the class exists in the langchain-openai package and should be used instead. To use it run `pip install -U langchain-openai` and import as `from langchain_openai import OpenAI`.\n",
- " warn_deprecated(\n"
- ]
- },
- {
- "data": {
- "application/vnd.google.colaboratory.intrinsic+json": {
- "type": "string"
- },
- "text/plain": [
- "'\\n\\n{\\n \"title\": \"春风\",\\n \"author\": \"李白\",\\n \"first_line\": \"春风又绿江南岸\",\\n \"content\": [\\n \"春风又绿江南岸\",\\n \"花开满树柳如丝\",\\n \"鸟儿欢唱天地宽\",\\n \"人间春色最宜人\"\\n ]\\n}\\n\\n{\\n \"title\": \"夜雨\",\\n \"author\": \"杜甫\",\\n \"first_line\": \"夜雨潇潇\",\\n \"content\": [\\n \"夜雨潇潇\",\\n \"孤灯照旧\",\\n \"思念如潮\",\\n \"泛滥心头\"\\n ]\\n}\\n\\n{\\n \"title\": \"山行\",\\n \"author\": \"王维\",\\n \"first_line\": \"远上寒山石径斜\",\\n \"content\": [\\n \"远上寒山石径斜\",\\n \"白云生处有人家\",\\n \"停车坐爱枫林晚\",\\n \"霜叶红于二月花\"\\n ]\\n}'"
- ]
- },
- "execution_count": 16,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "from langchain.llms import OpenAI\n",
- "import json\n",
- "\n",
- "# 使用早期的OpenAI模型\n",
- "simple_model = OpenAI(\n",
- " temperature=0,\n",
- " max_tokens=1000,\n",
- " model=\"gpt-3.5-turbo-instruct\"\n",
- ")\n",
- "simple_chain = simple_model | json.loads\n",
- "\n",
- "challenge = \"写三首诗,并以josn格式输出,每首诗必须包含:标题,作者和诗的第一句。\"\n",
- "\n",
- "simple_model.invoke(challenge)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "6a22ae8e",
- "metadata": {
- "id": "6a22ae8e"
- },
- "source": [
- "如果我们使用`simple_chain`来运行,我们就会发现出现了 json 解码错误的问题,因为返回的结果就是一个字符串,无法解析,所以下面代码就会报错。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "08ee6ba5",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 330
- },
- "id": "08ee6ba5",
- "outputId": "2ef7f71f-4398-47ed-a8d8-bcaa75bd91d8"
- },
- "outputs": [
- {
- "ename": "JSONDecodeError",
- "evalue": "Extra data: line 15 column 1 (char 147)",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msimple_chain\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minvoke\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchallenge\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
- "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/langchain_core/runnables/base.py\u001b[0m in \u001b[0;36minvoke\u001b[0;34m(self, input, config)\u001b[0m\n\u001b[1;32m 2051\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2052\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstep\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msteps\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2053\u001b[0;31m input = step.invoke(\n\u001b[0m\u001b[1;32m 2054\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2055\u001b[0m \u001b[0;31m# mark each step as a child run\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/langchain_core/runnables/base.py\u001b[0m in \u001b[0;36minvoke\u001b[0;34m(self, input, config, **kwargs)\u001b[0m\n\u001b[1;32m 3505\u001b[0m \u001b[0;34m\"\"\"Invoke this runnable synchronously.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3506\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"func\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3507\u001b[0;31m return self._call_with_config(\n\u001b[0m\u001b[1;32m 3508\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_invoke\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3509\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/langchain_core/runnables/base.py\u001b[0m in \u001b[0;36m_call_with_config\u001b[0;34m(self, func, input, config, run_type, **kwargs)\u001b[0m\n\u001b[1;32m 1244\u001b[0m output = cast(\n\u001b[1;32m 1245\u001b[0m \u001b[0mOutput\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1246\u001b[0;31m context.run(\n\u001b[0m\u001b[1;32m 1247\u001b[0m \u001b[0mcall_func_with_variable_args\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1248\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;31m# type: ignore[arg-type]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/langchain_core/runnables/config.py\u001b[0m in \u001b[0;36mcall_func_with_variable_args\u001b[0;34m(func, input, config, run_manager, **kwargs)\u001b[0m\n\u001b[1;32m 324\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrun_manager\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0maccepts_run_manager\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 325\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"run_manager\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrun_manager\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 326\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# type: ignore[call-arg]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 327\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 328\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/langchain_core/runnables/base.py\u001b[0m in \u001b[0;36m_invoke\u001b[0;34m(self, input, run_manager, config, **kwargs)\u001b[0m\n\u001b[1;32m 3381\u001b[0m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mchunk\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3382\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3383\u001b[0;31m output = call_func_with_variable_args(\n\u001b[0m\u001b[1;32m 3384\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mconfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrun_manager\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3385\u001b[0m )\n",
- "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/langchain_core/runnables/config.py\u001b[0m in \u001b[0;36mcall_func_with_variable_args\u001b[0;34m(func, input, config, run_manager, **kwargs)\u001b[0m\n\u001b[1;32m 324\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrun_manager\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0maccepts_run_manager\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 325\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"run_manager\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrun_manager\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 326\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# type: ignore[call-arg]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 327\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 328\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/lib/python3.10/json/__init__.py\u001b[0m in \u001b[0;36mloads\u001b[0;34m(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 344\u001b[0m \u001b[0mparse_int\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mparse_float\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mand\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 345\u001b[0m parse_constant is None and object_pairs_hook is None and not kw):\n\u001b[0;32m--> 346\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_default_decoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 347\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcls\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 348\u001b[0m \u001b[0mcls\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mJSONDecoder\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/usr/lib/python3.10/json/decoder.py\u001b[0m in \u001b[0;36mdecode\u001b[0;34m(self, s, _w)\u001b[0m\n\u001b[1;32m 338\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_w\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 339\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 340\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mJSONDecodeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Extra data\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 341\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 342\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;31mJSONDecodeError\u001b[0m: Extra data: line 15 column 1 (char 147)"
- ]
- }
- ],
- "source": [
- "simple_chain.invoke(challenge)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1c534366",
- "metadata": {
- "id": "1c534366"
- },
- "source": [
- "### 4.2 使用新模型格式化输出\n",
- "\n",
- "所以我们会发现早期版本的 OpenAI 模型不支持格式化的输出,所以即使使用 LangChain 并且加上了`json.loads`,还是会出现错误;但是如果我们使用新的`gpt-3.5-turbo`模型就不会出现这个问题。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "f0e34ed7",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "f0e34ed7",
- "outputId": "f3ae66a2-b0c7-4557-8914-687ac20b2a08"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'poem1': {'title': '春风',\n",
- " 'author': '李白',\n",
- " 'first_line': '春风又绿江南岸。',\n",
- " 'content': '春风又绿江南岸,明月何时照我还。'},\n",
- " 'poem2': {'title': '静夜思',\n",
- " 'author': '杜甫',\n",
- " 'first_line': '床前明月光,',\n",
- " 'content': '床前明月光,疑是地上霜。'},\n",
- " 'poem3': {'title': '登鹳雀楼',\n",
- " 'author': '王之涣',\n",
- " 'first_line': '白日依山尽,',\n",
- " 'content': '白日依山尽,黄河入海流。'}}"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# 默认使用新的模型\n",
- "model = ChatOpenAI(temperature=0)\n",
- "chain = model | StrOutputParser() | json.loads\n",
- "\n",
- "chain.invoke(challenge)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "603a2605",
- "metadata": {
- "id": "603a2605"
- },
- "source": [
- "### 4.3 fallbacks方法\n",
- "\n",
- "那这个时候可能就会思考,有没有什么方法,在不用改变太多代码的情况下,让早期的模型也能达到格式化输出的效果,而不是写复杂的格式化输出的代码去对结果进行操作。这时候我们就可以使用`fallbacks`的方式赋予早期模型这样格式化的能力,从结果我们也可以看出,我们成功使用`fallbacks`赋予了简单模型格式化的能力。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "7b1aede1",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "7b1aede1",
- "outputId": "3823946c-a875-4807-9b0d-ba2910fe584d"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'poem1': {'title': '春风',\n",
- " 'author': '李白',\n",
- " 'first_line': '春风又绿江南岸。',\n",
- " 'content': '春风又绿江南岸,明月何时照我还。'},\n",
- " 'poem2': {'title': '静夜思',\n",
- " 'author': '杜甫',\n",
- " 'first_line': '床前明月光,',\n",
- " 'content': '床前明月光,疑是地上霜。'},\n",
- " 'poem3': {'title': '登鹳雀楼',\n",
- " 'author': '王之涣',\n",
- " 'first_line': '白日依山尽,',\n",
- " 'content': '白日依山尽,黄河入海流。'}}"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# 使用with_fallbacks机制\n",
- "final_chain = simple_chain.with_fallbacks([chain])\n",
- "\n",
- "# 调用final_chain的invoke方法,并传递challenge参数\n",
- "final_chain.invoke(challenge)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "bb6f0aac",
- "metadata": {},
- "source": [
- "### 4.4 fallbacks 是如何实现的?\n",
- "\n",
- "当我们调用 LLM 时,经常会出现由于底层 API 问题、速率问题或者网络问题等原因,导致不能成功运行 LLM 。在这种情况下,我们就可以使用回退这种方法来解决这个问题,具体来说,它是通过使用另一种 LLM 来代替原先的不可运行的 LLM 产生结果,请看下面例子:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "4dd3f251",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Both chat model classes are exported from langchain.chat_models.\n",
- "# ChatAnthropic needs the `anthropic` package and an ANTHROPIC_API_KEY to be set.\n",
- "from langchain.chat_models import ChatAnthropic, ChatOpenAI\n",
- "\n",
- "# Try ChatAnthropic first; if that call raises, the same input is retried with ChatOpenAI.\n",
- "model = ChatAnthropic().with_fallbacks([ChatOpenAI()])\n",
- "model.invoke('hello')"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "1b8fb2aa",
- "metadata": {},
- "source": [
- "在这种情况下,通常会优先使用 ChatAnthropic 进行回答,但是如果调用 ChatAnthropic 失败了,会回退到使用 ChatOpenAI 模型来生成响应。如果两种 LLM 都失败了,上面的代码会抛出异常;如果希望此时仍能返回结果,可以在回退列表的末尾再加入一个返回硬编码响应的 Runnable。硬编码的默认响应用于处理异常情况或者在无法从外部资源获取所需信息时提供一个备用选项,例如 \"Looks like our LLM providers are down. Here's a nice 🦜️ emoji for you instead.\"(看起来我们的 LLM 提供商出了问题,那么,这里有一个可爱的 🦜️ 表情符号给你。)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "438e1841",
- "metadata": {},
- "source": [
- "如果你想了解更多关于 fallbacks 的内容,请参考[官方文档](https://python.langchain.com/docs/guides/fallbacks)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "96e3229d",
- "metadata": {
- "id": "96e3229d"
- },
- "source": [
- "## 五、接口 Interface\n",
- "\n",
- "在使用LangChain中,存在许多接口,其中公开的标准接口包括:\n",
- "\n",
- "- stream:流式返回输出内容\n",
- "- invoke:输入调用chain\n",
- "- batch:在输入列表中并行调用chain\n",
- "\n",
- "这些也有相应的异步方法:\n",
- "\n",
- "- astream:异步流式返回输出内容\n",
- "- ainvoke:在输入上异步调用chain\n",
- "- abatch:在输入列表中并行异步调用chain\n",
- "\n",
- "首先我们定义一个简单的提示模板,也就是\"给我讲一个关于{topic}的短笑话\",然后定义了一个简单的链`Chain = prompt | LLM | OutputParser`。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "72e228da",
- "metadata": {
- "id": "72e228da"
- },
- "outputs": [],
- "source": [
- "# 创建一个ChatPromptTemplate对象,使用模板\"给我讲一个关于{topic}的短笑话\"\n",
- "prompt = ChatPromptTemplate.from_template(\n",
- " \"给我讲一个关于{topic}的短笑话\"\n",
- ")\n",
- "\n",
- "# 创建一个ChatOpenAI模型\n",
- "model = ChatOpenAI()\n",
- "\n",
- "# 创建一个StrOutputParser对象\n",
- "output_parser = StrOutputParser()\n",
- "\n",
- "# 创建一个chain,将prompt、model和output_parser连接起来\n",
- "chain = prompt | model | output_parser"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e8b02924",
- "metadata": {
- "id": "e8b02924"
- },
- "source": [
- "### 5.1 invoke接口\n",
- "\n",
- "接下来我们分别使用对应的接口,比如我们首先使用常规的`invoke`的调用,这个也是前面展现的方法,我们得到了对应结果。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "id": "e339d019",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 53
- },
- "id": "e339d019",
- "outputId": "6893312f-8473-411f-d175-c351532e5646"
- },
- "outputs": [
- {
- "data": {
- "application/vnd.google.colaboratory.intrinsic+json": {
- "type": "string"
- },
- "text/plain": [
- "'当熊在森林里遇到一只兔子时,他问:“兔子先生,你有没有问题?”兔子回答道:“当然,先生熊,我有一个问题。你怎么会拉这么长的尾巴?”熊听后笑了起来:“兔子先生,这不是尾巴,这是我的领带!”'"
- ]
- },
- "execution_count": 21,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "chain.invoke({\"topic\": \"熊\"})"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "2520f449",
- "metadata": {
- "id": "2520f449"
- },
- "source": [
- "### 5.2 batch接口\n",
- "\n",
- "我们再尝试使用`batch`的接口,我们会发现大模型可以返回两个问题的答案,我们会给chain一个输入的列表,列表中可以包含多个问题,最后返回多个问题的答案。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "id": "d549ac8a",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "d549ac8a",
- "outputId": "fbd34c84-26b2-4576-d7ce-aa73be98d755"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['好的,这是一个关于熊的短笑话:\\n\\n有一天,一只熊走进了一家酒吧。他走到吧台前,对酒保说:“请给我一杯……蜂蜜啤酒。”\\n\\n酒保疑惑地看着熊,说:“对不起,我们这里没有蜂蜜啤酒。”\\n\\n熊有些失望地叹了口气,然后说:“好吧,那就给我来一杯……草莓酒吧。”\\n\\n酒保摇摇头,说:“抱歉,我们也没有草莓酒。”\\n\\n熊又叹了口气,然后说:“那请给我来一杯……蜜糖红酒吧。”\\n\\n酒保实在无法忍受了,他对熊说:“对不起,我们这里没有这些奇怪的酒,你是熊,你应该知道熊只能喝蜂蜜。”\\n\\n熊听后一愣,然后脸色一变,说:“原来你们这里没有蜂蜜啤酒,草莓酒和蜜糖红酒?那请给我来一杯……白开水吧。”',\n",
- " '有一天,一只狐狸在森林里遇到了一只兔子。狐狸笑嘻嘻地对兔子说:“喂,兔子,我有一个好消息和一个坏消息,你想先听哪个?”兔子有些好奇地问:“那就先告诉我好消息吧。”狐狸眯起眼睛说:“好消息是,你的智商比我高。”兔子高兴地跳了起来:“太好了,那坏消息是什么?”狐狸一脸轻松地说:“坏消息是,你的智商还比不过胡萝卜。”']"
- ]
- },
- "execution_count": 23,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "chain.batch([{\"topic\": \"熊\"}, {\"topic\": \"狐狸\"}])"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "307a41d2",
- "metadata": {
- "id": "307a41d2"
- },
- "source": [
- "### 5.3 stream接口\n",
- "\n",
- "接下来我们再看看`stream`接口,也就是流式输出内容,这样的功能很有必要,有时候可以免去用户等待的烦恼,让用户看到一个一个词蹦出来而不是一个空的屏幕,这样会带来更好的用户体验。"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "id": "f934e46d",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "f934e46d",
- "outputId": "872109e2-47cb-4da6-a439-eb010ed3530d"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "好\n",
- "的\n",
- ",\n",
- "这\n",
- "是\n",
- "一个\n",
- "关\n",
- "于\n",
- "熊\n",
- "的\n",
- "短\n",
- "笑\n",
- "话\n",
- ":\n",
- "\n",
- "\n",
- "有\n",
- "一\n",
- "天\n",
- ",\n",
- "一\n",
- "只\n",
- "小\n",
- "熊\n",
- "走\n",
- "进\n",
- "了\n",
- "一\n",
- "家\n",
- "酒\n",
- "吧\n",
- "。\n",
- "他\n",
- "走\n",
- "到\n",
- "吧\n",
- "台\n",
- "前\n",
- ",\n",
- "对\n",
- "酒\n",
- "保\n",
- "说\n",
- ":“\n",
- "酒\n",
- "保\n",
- ",\n",
- "给\n",
- "我\n",
- "一\n",
- "杯\n",
- "牛\n",
- "奶\n",
- "。”\n",
- "\n",
- "\n",
- "酒\n",
- "保\n",
- "惊\n",
- "讶\n",
- "地\n",
- "问\n",
- "道\n",
- ":“\n",
- "小\n",
- "熊\n",
- ",\n",
- "你\n",
- "怎\n",
- "么\n",
- "会\n",
- "来\n",
- "这\n",
- "里\n",
- "喝\n",
- "牛\n",
- "奶\n",
- "?\n",
- "”\n",
- "\n",
- "\n",
- "小\n",
- "熊\n",
- "深\n",
- "情\n",
- "地\n",
- "回\n",
- "答\n",
- ":“\n",
- "因\n",
- "为\n",
- "我的\n",
- "妈\n",
- "妈\n",
- "说\n",
- ",\n",
- "每\n",
- "当\n",
- "我\n",
- "喝\n",
- "酒\n",
- "的\n",
- "时\n",
- "候\n",
- ",\n",
- "我\n",
- "都\n",
- "会\n",
- "变\n",
- "得\n",
- "熊\n",
- "样\n",
- "!”\n",
- "\n"
- ]
- }
- ],
- "source": [
- "for t in chain.stream({\"topic\": \"熊\"}):\n",
- " print(t)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "fda98771",
- "metadata": {
- "id": "fda98771"
- },
- "source": [
- "### 5.4 异步接口\n",
- "\n",
- "我们还可以尝试异步来调用,使用`ainvoke`来调用。\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "id": "cd372fec",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 71
- },
- "id": "cd372fec",
- "outputId": "1232ac0b-7900-493c-aec6-7b99abb928d8"
- },
- "outputs": [
- {
- "data": {
- "application/vnd.google.colaboratory.intrinsic+json": {
- "type": "string"
- },
- "text/plain": [
- "'好的,以下是一个关于熊的短笑话:\\n\\n有一只熊走进了一家餐厅,他走到柜台前,对着服务员说:“我想要一杯咖啡和......嗯,一块...牛肉三明治。”\\n服务员疑惑地看着熊,然后问道:“对不起,先生,你是真的想要一块牛肉三明治吗?”\\n熊点了点头。\\n服务员又问:“那请问为什么你要来这里点餐呢?”\\n熊回答道:“因为我是个熊啊!”'"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "response = await chain.ainvoke({\"topic\": \"熊\"})\n",
- "response"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "926c23d0",
- "metadata": {
- "id": "926c23d0"
- },
- "source": [
- "## 六、英文提示词"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "nf-C5Y-9TR2d",
- "metadata": {
- "id": "nf-C5Y-9TR2d"
- },
- "source": [
- "**一、构建简单链**"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "id": "urymAcXkTOQ2",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 53
- },
- "id": "urymAcXkTOQ2",
- "outputId": "97d1aee3-86a0-44cb-c534-6db4ffb6a11b"
- },
- "outputs": [
- {
- "data": {
- "application/vnd.google.colaboratory.intrinsic+json": {
- "type": "string"
- },
- "text/plain": [
- "'Why did the bear bring a flashlight to the party?\\n\\nBecause he wanted to be the \"light\" of the bearbecue!'"
- ]
- },
- "execution_count": 26,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "prompt = ChatPromptTemplate.from_template(\n",
- " \"tell me a short joke about {topic}\"\n",
- ")\n",
- "model = ChatOpenAI()\n",
- "output_parser = StrOutputParser()\n",
- "\n",
- "chain = prompt | model | output_parser\n",
- "\n",
- "chain.invoke({\"topic\": \"bears\"})"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e34b6bd6",
- "metadata": {},
- "source": [
- "**2.1 构建简单文档数据库**"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "id": "J-IpScccTHtL",
- "metadata": {
- "id": "J-IpScccTHtL"
- },
- "outputs": [],
- "source": [
- "vectorstore = DocArrayInMemorySearch.from_texts(\n",
- " [\"harrison worked at kensho\", \"bears like to eat honey\"],\n",
- " embedding=OpenAIEmbeddings()\n",
- ")\n",
- "retriever = vectorstore.as_retriever()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "id": "4nWE8nLjTJuT",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "4nWE8nLjTJuT",
- "outputId": "65742853-5146-4a1c-fb0b-d1e456532b95"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[Document(page_content='harrison worked at kensho'),\n",
- " Document(page_content='bears like to eat honey')]"
- ]
- },
- "execution_count": 28,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "retriever.get_relevant_documents(\"where did harrison work?\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "id": "hygSW50bTXzV",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "hygSW50bTXzV",
- "outputId": "2f88f2d7-781d-41d0-c50e-fa18e0420c3d"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[Document(page_content='bears like to eat honey'),\n",
- " Document(page_content='harrison worked at kensho')]"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "retriever.get_relevant_documents(\"what do bears like to eat\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "id": "-6iG51i5TZGJ",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 35
- },
- "id": "-6iG51i5TZGJ",
- "outputId": "29de378b-2151-43fc-d2fc-f6a1b8d16896"
- },
- "outputs": [
- {
- "data": {
- "application/vnd.google.colaboratory.intrinsic+json": {
- "type": "string"
- },
- "text/plain": [
- "'Harrison worked at Kensho.'"
- ]
- },
- "execution_count": 31,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "template = \"\"\"Answer the question based only on the following context:\n",
- "{context}\n",
- "\n",
- "Question: {question}\n",
- "\"\"\"\n",
- "prompt = ChatPromptTemplate.from_template(template)\n",
- "\n",
- "chain = RunnableMap({\n",
- " \"context\": lambda x: retriever.get_relevant_documents(x[\"question\"]),\n",
- " \"question\": lambda x: x[\"question\"]\n",
- "}) | prompt | model | output_parser\n",
- "\n",
- "chain.invoke({\"question\": \"where did harrison work?\"})"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "aabda125",
- "metadata": {},
- "source": [
- "**2.2 使用RunnableMap**"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "id": "JoLBmwfETg1L",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "JoLBmwfETg1L",
- "outputId": "73127639-c19e-400c-e26e-8f68dfd93635"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'context': [Document(page_content='harrison worked at kensho'),\n",
- " Document(page_content='bears like to eat honey')],\n",
- " 'question': 'where did harrison work?'}"
- ]
- },
- "execution_count": 33,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "inputs = RunnableMap({\n",
- " \"context\": lambda x: retriever.get_relevant_documents(x[\"question\"]),\n",
- " \"question\": lambda x: x[\"question\"]\n",
- "})\n",
- "\n",
- "inputs.invoke({\"question\": \"where did harrison work?\"})"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "HqjgfGT8Tj2Z",
- "metadata": {
- "id": "HqjgfGT8Tj2Z"
- },
- "source": [
- "**3.1 单函数绑定**"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "id": "_XZPKC1_Th6S",
- "metadata": {
- "id": "_XZPKC1_Th6S"
- },
- "outputs": [],
- "source": [
- "functions = [\n",
- " {\n",
- " \"name\": \"weather_search\",\n",
- " \"description\": \"Search for weather given an airport code\",\n",
- " \"parameters\": {\n",
- " \"type\": \"object\",\n",
- " \"properties\": {\n",
- " \"airport_code\": {\n",
- " \"type\": \"string\",\n",
- " \"description\": \"The airport code to get the weather for\"\n",
- " },\n",
- " },\n",
- " \"required\": [\"airport_code\"]\n",
- " }\n",
- " }\n",
- " ]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "id": "0UPE0nWDTm3T",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "0UPE0nWDTm3T",
- "outputId": "a4762623-c42f-4950-a21e-da73f7efe0c9"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "AIMessage(content='', additional_kwargs={'function_call': {'name': 'weather_search', 'arguments': '{\\n \"airport_code\": \"SFO\"\\n}'}})"
- ]
- },
- "execution_count": 35,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Prompt made of a single human message filled from the `input` key.\n",
- "prompt = ChatPromptTemplate.from_messages([(\"human\", \"{input}\")])\n",
- "\n",
- "# Pass the schema list to the chat model as a bound `functions` argument.\n",
- "model = ChatOpenAI(temperature=0).bind(functions=functions)\n",
- "\n",
- "# Pipe the prompt into the bound model.\n",
- "runnable = prompt | model\n",
- "\n",
- "runnable.invoke({\"input\": \"what is the weather in sf\"})"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "bc6a3fd8",
- "metadata": {},
- "source": [
- "**3.2 多个函数绑定**"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "id": "U2R9nKncTpy6",
- "metadata": {
- "id": "U2R9nKncTpy6"
- },
- "outputs": [],
- "source": [
- "functions = [\n",
- " {\n",
- " \"name\": \"weather_search\",\n",
- " \"description\": \"Search for weather given an airport code\",\n",
- " \"parameters\": {\n",
- " \"type\": \"object\",\n",
- " \"properties\": {\n",
- " \"airport_code\": {\n",
- " \"type\": \"string\",\n",
- " \"description\": \"The airport code to get the weather for\"\n",
- " },\n",
- " },\n",
- " \"required\": [\"airport_code\"]\n",
- " }\n",
- " },\n",
- " {\n",
- " \"name\": \"sports_search\",\n",
- " \"description\": \"Search for news of recent sport events\",\n",
- " \"parameters\": {\n",
- " \"type\": \"object\",\n",
- " \"properties\": {\n",
- " \"team_name\": {\n",
- " \"type\": \"string\",\n",
- " \"description\": \"The sports team to search for\"\n",
- " },\n",
- " },\n",
- " \"required\": [\"team_name\"]\n",
- " }\n",
- " }\n",
- " ]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "id": "i9yES-wwTrYO",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "i9yES-wwTrYO",
- "outputId": "c4e1ae94-d087-454b-a8c1-7fab9df2b790"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "AIMessage(content='', additional_kwargs={'function_call': {'name': 'sports_search', 'arguments': '{\\n \"team_name\": \"patriots\"\\n}'}})"
- ]
- },
- "execution_count": 37,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Bind the two-function schema list to a freshly constructed ChatOpenAI\n",
- "# instead of re-binding the existing `model`, so this cell does not depend on\n",
- "# whatever `model` was bound to by earlier cells or earlier runs of this cell.\n",
- "model = ChatOpenAI(temperature=0).bind(functions=functions)\n",
- "\n",
- "# Re-compose the chain so it uses the newly bound model.\n",
- "runnable = prompt | model\n",
- "\n",
- "runnable.invoke({\"input\": \"how did the patriots do yesterday?\"})"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "300a9405",
- "metadata": {},
- "source": [
- "**4.1 使用早期模型格式化输出**"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "id": "4lECkukwT4ua",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 107
- },
- "id": "4lECkukwT4ua",
- "outputId": "88c67d19-6d26-406a-9d26-08827794dc4e"
- },
- "outputs": [
- {
- "data": {
- "application/vnd.google.colaboratory.intrinsic+json": {
- "type": "string"
- },
- "text/plain": [
- "'\\n\\n{\\n \"title\": \"Autumn Leaves\",\\n \"author\": \"Emily Dickinson\",\\n \"first_line\": \"The leaves are falling, one by one\"\\n}\\n\\n{\\n \"title\": \"The Ocean\\'s Song\",\\n \"author\": \"Pablo Neruda\",\\n \"first_line\": \"I hear the ocean\\'s song, a symphony of waves\"\\n}\\n\\n{\\n \"title\": \"A Winter\\'s Night\",\\n \"author\": \"Robert Frost\",\\n \"first_line\": \"The snow falls softly, covering the ground\"\\n}'"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Model instance used for the JSON-output experiment.\n",
- "simple_model = OpenAI(temperature=0, max_tokens=1000, model=\"gpt-3.5-turbo-instruct\")\n",
- "\n",
- "# Chain that hands the model's output directly to json.loads.\n",
- "simple_chain = simple_model | json.loads\n",
- "\n",
- "challenge = \"write three poems in a json blob, where each poem is a json blob of a title, author, and first line\"\n",
- "\n",
- "# Call the model on its own (not the chain) to look at the raw text it returns.\n",
- "simple_model.invoke(challenge)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "35c60800",
- "metadata": {},
- "source": [
- "**早期模型无法保证输出合法的 JSON:上面的返回值是三个相互独立的 JSON 对象,而不是一个整体的 JSON,因此用 `json.loads` 解析时会抛出 `JSONDecodeError`(Extra data)**"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "id": "aX0oo-25T9hE",
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 330
- },
- "id": "aX0oo-25T9hE",
- "outputId": "8425f5f7-5891-4448-d67d-be89208e05a9"
- },
- "outputs": [
- {
- "ename": "JSONDecodeError",
- "evalue": "Extra data: line 9 column 1 (char 125)",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m\u001b[0m in \u001b[0;36m |