feat: replanning before locating (#167)
yuyutaotao authored Dec 8, 2024
1 parent 515639f commit 082e347
Showing 120 changed files with 5,422 additions and 4,928 deletions.
57 changes: 56 additions & 1 deletion .github/workflows/ai.yml
@@ -4,6 +4,12 @@ on:
branches:
- main
workflow_dispatch:
inputs:
branch:
description: 'Branch to checkout'
required: false
default: 'main'
type: string

jobs:
main:
@@ -15,12 +21,14 @@
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
OPENAI_BASE_URL: ${{ secrets.OPENAI_BASE_URL }}
MIDSCENE_MODEL_NAME: ${{ secrets.MIDSCENE_MODEL_NAME }}
MIDSCENE_MODEL_NAME: gpt-4o-2024-08-06
MIDSCENE_DEBUG_AI_PROFILE: 1

steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.branch || 'main' }}

- name: Setup pnpm
uses: pnpm/action-setup@v2
@@ -64,12 +72,59 @@ jobs:

- name: Run e2e tests
run: pnpm run e2e
id: e2e-tests
continue-on-error: true

- name: Upload e2e report
if: always()
uses: actions/upload-artifact@v4
with:
name: e2e-report
path: ${{ github.workspace }}/packages/web-integration/midscene_run/report
if-no-files-found: ignore

- name: Run e2e tests cache
run: pnpm run e2e:cache
id: e2e-tests-cache
continue-on-error: true

- name: Upload e2e cache report
if: always()
uses: actions/upload-artifact@v4
with:
name: e2e-cache-report
path: ${{ github.workspace }}/packages/web-integration/midscene_run/report
if-no-files-found: ignore

- name: Run e2e tests report
run: pnpm run e2e:report
id: e2e-tests-report
continue-on-error: true

- name: Upload e2e report output
if: always()
uses: actions/upload-artifact@v4
with:
name: e2e-report-output
path: ${{ github.workspace }}/packages/web-integration/midscene_run/report
if-no-files-found: ignore

- name: Run tests
run: pnpm run test:ai
id: test-ai
continue-on-error: true

- name: Upload test-ai output
if: always()
uses: actions/upload-artifact@v4
with:
name: test-ai-output
path: ${{ github.workspace }}/packages/web-integration/midscene_run/report
if-no-files-found: ignore

- name: Check if script failed
if: steps.test-ai.outcome == 'failure' || steps.e2e-tests.outcome == 'failure' || steps.e2e-tests-cache.outcome == 'failure' || steps.e2e-tests-report.outcome == 'failure'
run: exit 1



2 changes: 1 addition & 1 deletion .vscode/settings.json
@@ -4,5 +4,5 @@
},
"editor.defaultFormatter": "biomejs.biome",
"editor.formatOnSave": true,
"cSpell.words": ["AITEST", "httpbin"]
"cSpell.words": ["AITEST", "aweme", "httpbin", "iconfont", "taobao"]
}
2 changes: 1 addition & 1 deletion apps/site/docs/en/automate-with-scripts-in-yaml.mdx
@@ -137,7 +137,7 @@ target:

# object, the strategy to wait for network idle, optional
waitForNetworkIdle:
# number, the timeout in milliseconds, 30000 for default, optional
# number, the timeout in milliseconds, 10000ms for default, optional
timeout: <ms>
# boolean, continue on network idle error, true for default
continueOnNetworkIdleError: <boolean>
2 changes: 1 addition & 1 deletion apps/site/docs/en/integrate-with-playwright.mdx
@@ -37,7 +37,7 @@ export default defineConfig({

## Step 2. extend the `test` instance

Save the following code as `./fixture.ts`;
Save the following code as `./e2e/fixture.ts`;

```typescript
import { test as base } from '@playwright/test';
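
For reference, the complete fixture in the Midscene documentation looks roughly like the sketch below. The `@midscene/web/playwright` import path and the `PlaywrightAiFixture` helper come from the Midscene docs rather than from this diff, so treat them as assumptions:

```typescript
// ./e2e/fixture.ts: a minimal sketch, assuming the PlaywrightAiFixture
// helper exported by @midscene/web/playwright (not shown in this diff).
import { test as base } from '@playwright/test';
import type { PlayWrightAiFixtureType } from '@midscene/web/playwright';
import { PlaywrightAiFixture } from '@midscene/web/playwright';

// Extends Playwright's `test` with Midscene fixtures such as `ai` and `aiQuery`.
export const test = base.extend<PlayWrightAiFixtureType>(PlaywrightAiFixture());
```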
16 changes: 14 additions & 2 deletions apps/site/docs/en/prompting-tips.md
@@ -6,20 +6,32 @@ The natural language parameter passed to Midscene will be part of the prompt sent

Since AI models are heuristic by nature, the purpose of prompt tuning is to obtain stable responses from the model across runs. In most cases, expecting a consistent response from the LLM by using a good prompt is entirely feasible.

## Detailed descriptions and samples are welcome
## Use detailed descriptions and samples

Detailed descriptions and examples are always welcome.

For example:

Bad ❌: "Search 'headphone'"

Good ✅: "Find the search box (it should be along with a region switch, such as 'domestic' or 'international'), type 'headphone', and hit Enter."
Good ✅: "Click the search box (it should be along with a region switch, such as 'domestic' or 'international'), type 'headphone', and hit Enter."

Bad ❌: "Assert: food delivery service is in normal state"

Good ✅: "Assert: There is a 'food delivery service' on page, and is in normal state"
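
As a minimal sketch of how such prompts are used in a Playwright test, assuming the Midscene fixture from the integration docs (which exposes `ai` and `aiAssert`):

```typescript
import { test } from './fixture'; // the fixture from the Playwright integration docs

// Each prompt below reuses the "Good" wording from this page verbatim.
test('detailed prompts', async ({ page, ai, aiAssert }) => {
  await page.goto('https://www.example.com'); // assumption: any e-commerce page

  // Detailed, unambiguous instruction, in the "Good" style above.
  await ai(
    "Click the search box (it should be along with a region switch, such as 'domestic' or 'international'), type 'headphone', and hit Enter",
  );

  // Assertion phrased with concrete, checkable details.
  await aiAssert("There is a 'food delivery service' on the page, and it is in normal state");
});
```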

## One prompt should only do one thing

Use `.ai` to do one task at a time. Although Midscene has an auto-replanning strategy, it's still preferable to keep each prompt concise; otherwise the LLM output will likely be messy. The token cost of a long prompt and a short prompt is almost the same.

Bad ❌: "Click Login button, then click Sign up button, fill the form with '[email protected]' in the email field, 'test' in the password field, and click Sign up button"

Good ✅: Split the task into three steps (see the sketch after these steps):

"Click Login Button"
"Click Sign up button"
"Fill the form with '[email protected]' in the email field, 'test' in the password field, and click Sign up button"

## LLMs can NOT tell exact numbers like coordinates or hex-style colors, so give it some choices

For example:
2 changes: 1 addition & 1 deletion apps/site/docs/zh/automate-with-scripts-in-yaml.mdx
@@ -137,7 +137,7 @@ target:

# 等待网络空闲的策略,可选
waitForNetworkIdle:
# 等待超时时间,可选,默认 30000
# 等待超时时间,可选,默认 10000ms
timeout: <ms>
# 是否在等待超时后继续,可选,默认 true
continueOnNetworkIdleError: <boolean>
2 changes: 1 addition & 1 deletion apps/site/docs/zh/integrate-with-playwright.mdx
@@ -38,7 +38,7 @@ export default defineConfig({

## 第二步:扩展 `test` 实例

把下方代码保存为 `./fixture.ts`;
把下方代码保存为 `./e2e/fixture.ts`;

```typescript
import { test as base } from '@playwright/test';
12 changes: 12 additions & 0 deletions apps/site/docs/zh/prompting-tips.md
@@ -19,6 +19,18 @@

正确示例 ✅: "断言:界面上有个“外卖服务”的板块,并且标识着“正常”"

## 一个 Prompt (指令)只做一件事

使用 `.ai` 每次只做一件事。尽管 Midscene 有自动重规划能力,但仍应保持指令简洁。否则,LLM 的输出可能会变得混乱。指令的长度对 token 消耗的影响几乎可以忽略不计。

错误示例 ❌: "点击登录按钮,然后点击注册按钮,在表单中输入'[email protected]'作为邮箱,'test'作为密码,然后点击注册按钮"

正确示例 ✅: 将任务分解为三个步骤:

"点击登录按钮"
"点击注册按钮"
"在表单中输入'[email protected]'作为邮箱,'test'作为密码,然后点击注册按钮"

### LLM 无法准确辨别数值(比如坐标或十六进制颜色值),不妨提供一些选项

例如:
2 changes: 1 addition & 1 deletion packages/cli/src/tty-renderer.ts
@@ -3,7 +3,7 @@ import type { Writable } from 'node:stream';
import { stripVTControlCharacters } from 'node:util';
import restoreCursor from 'restore-cursor';

const DEFAULT_RENDER_INTERVAL = 16;
const DEFAULT_RENDER_INTERVAL = 160;

const ESC = '\x1B[';
const CLEAR_LINE = `${ESC}K`;
3 changes: 2 additions & 1 deletion packages/cli/src/yaml-player.ts
@@ -33,6 +33,7 @@ export const defaultUA =
export const defaultViewportWidth = 1280;
export const defaultViewportHeight = 960;
export const defaultViewportScale = process.platform === 'darwin' ? 2 : 1;
export const defaultWaitForNetworkIdleTimeout = 10 * 1000;

export function loadYamlScript(
content: string,
@@ -374,7 +375,7 @@ export class ScriptPlayer {
const waitForNetworkIdleTimeout =
typeof target.waitForNetworkIdle?.timeout === 'number'
? target.waitForNetworkIdle.timeout
: 30 * 1000;
: defaultWaitForNetworkIdleTimeout;
try {
if (waitForNetworkIdleTimeout > 0) {
await page.waitForNetworkIdle({
3 changes: 1 addition & 2 deletions packages/midscene/package.json
@@ -31,10 +31,9 @@
"upgrade": "modern upgrade",
"test": "vitest --run -u",
"test:ai": "AITEST=true npm run test",
"evaluate": "npm run test:ai -- tests/ai/evaluate/inspect.test.ts",
"computer": "TEST_COMPUTER=true npm run test:ai -- tests/ai/evaluate/computer.test.ts",
"evaluate": "npm run test:ai -- tests/ai/evaluate/inspect.test.ts",
"evaluate:update": "UPDATE_AI_DATA=true npm run test:ai -- tests/ai/evaluate/inspect.test.ts",
"evaluate:plan": "PLAN_INSPECT=true npm run test:ai -- tests/ai/evaluate/inspect.test.ts",
"prepublishOnly": "npm run build"
},
"dependencies": {
14 changes: 14 additions & 0 deletions packages/midscene/src/action/executor.ts
@@ -20,15 +20,19 @@ export class Executor {
// status of executor
status: 'init' | 'pending' | 'running' | 'completed' | 'error';

onFlushUpdate?: () => void;

constructor(
name: string,
description?: string,
tasks?: ExecutionTaskApply[],
onFlushUpdate?: () => void,
) {
this.status = tasks && tasks.length > 0 ? 'pending' : 'init';
this.name = name;
this.description = description;
this.tasks = (tasks || []).map((item) => this.markTaskAsPending(item));
this.onFlushUpdate = onFlushUpdate;
}

private markTaskAsPending(task: ExecutionTaskApply): ExecutionTask {
@@ -80,6 +84,13 @@

while (taskIndex < this.tasks.length) {
const task = this.tasks[taskIndex];
try {
if (this.onFlushUpdate) {
this.onFlushUpdate();
}
} catch (e) {
// console.error('error in onFlushUpdate', e);
}
assert(
task.status === 'pending',
`task status should be pending, but got: ${task.status}`,
@@ -151,6 +162,9 @@
} else {
this.status = 'error';
}
if (this.onFlushUpdate) {
await this.onFlushUpdate();
}
if (this.tasks.length) {
// return the last output
const outputIndex = Math.min(taskIndex, this.tasks.length - 1);
66 changes: 33 additions & 33 deletions packages/midscene/src/ai-model/automation/index.ts
@@ -1,5 +1,5 @@
import assert from 'node:assert';
import type { PlanningAIResponse, PlanningAction, UIContext } from '@/types';
import type { AIUsageInfo, PlanningAIResponse, UIContext } from '@/types';
import {
AIActionType,
type AIArgs,
@@ -12,21 +12,33 @@ import { describeUserPage } from '../prompt/util';
export async function plan(
userPrompt: string,
opts: {
whatHaveDone?: string;
originalPrompt?: string;
context: UIContext;
callAI?: typeof callAiFn<PlanningAIResponse>;
},
useModel?: 'coze' | 'openAI',
): Promise<{
plans: PlanningAction[];
}> {
): Promise<PlanningAIResponse> {
const { callAI, context } = opts || {};
const { screenshotBase64 } = context;
const { screenshotBase64, screenshotBase64WithElementMarker } = context;
const { description: pageDescription, elementByPosition } =
await describeUserPage(context);
let planFromAI: PlanningAIResponse | undefined;

const systemPrompt = systemPromptToTaskPlanning();

let taskBackgroundContext = '';
if (opts.originalPrompt && opts.whatHaveDone) {
taskBackgroundContext = `For your information, this is a task that some important person handed to you. Here is the original task description and what have been done after the previous actions:
=====================================
Original task description:
${opts.originalPrompt}
=====================================
What have been done:
${opts.whatHaveDone}
=====================================
`;
}
const msgs: AIArgs = [
{ role: 'system', content: systemPrompt },
{
@@ -35,59 +47,47 @@ export async function plan(
{
type: 'image_url',
image_url: {
url: screenshotBase64,
url: screenshotBase64WithElementMarker || screenshotBase64,
detail: 'high',
},
},
{
type: 'text',
text: `
pageDescription:\n
${pageDescription}
\n
Here is the description of the task. Just go ahead:
=====================================
${userPrompt}
=====================================
`,
pageDescription:\n
${pageDescription}
\n
Here is what you need to do now:
=====================================
${userPrompt}
=====================================
${taskBackgroundContext}
`.trim(),
},
]),
},
];

const call = callAI || callAiFn;
planFromAI = await call({
const { content, usage } = await call({
msgs,
AIActionType: AIActionType.PLAN,
useModel,
});

const actions = planFromAI?.actions || [];
planFromAI = content;

assert(planFromAI, "can't get planFromAI");
const actions = planFromAI?.actions || [];
assert(planFromAI, "can't get plans from AI");
assert(
actions.length > 0,
`no actions in ai plan with context: ${planFromAI}`,
);

actions.forEach((action) => {
if (action.type === 'Locate' && action.quickAnswer) {
if ('id' in action.quickAnswer) {
return;
}

if ('position' in action.quickAnswer) {
action.quickAnswer = {
...action.quickAnswer,
id: elementByPosition(action.quickAnswer.position)?.id!,
};
}
}
});

if (planFromAI.error) {
throw new Error(planFromAI.error);
}

return { plans: actions };
return planFromAI;
}
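
To illustrate the reworked signature, here is a minimal sketch of a caller that feeds `originalPrompt` and `whatHaveDone` back into `plan()` when replanning after some actions have already run. The surrounding helper is illustrative, not code from this commit:

```typescript
// Sketch only: the option names mirror the diff above.
import { plan } from '@/ai-model/automation';
import type { UIContext } from '@/types';

async function planNextRound(
  context: UIContext,
  originalPrompt: string,
  completedSteps: string[], // descriptions of actions already executed
) {
  // On a replanning round, the original instruction and a summary of prior
  // actions become the task background context in the LLM prompt.
  return plan('Continue the task based on what has been done so far', {
    context,
    originalPrompt,
    whatHaveDone: completedSteps.join('\n'),
  });
}
```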
