feat: replanning before locating (#167)
yuyutaotao authored Dec 8, 2024
1 parent 515639f commit 082e347
Showing 120 changed files with 5,422 additions and 4,928 deletions.
57 changes: 56 additions & 1 deletion .github/workflows/ai.yml
@@ -4,6 +4,12 @@ on:
branches:
- main
workflow_dispatch:
inputs:
branch:
description: 'Branch to checkout'
required: false
default: 'main'
type: string

jobs:
main:
@@ -15,12 +21,14 @@
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
OPENAI_BASE_URL: ${{ secrets.OPENAI_BASE_URL }}
MIDSCENE_MODEL_NAME: ${{ secrets.MIDSCENE_MODEL_NAME }}
MIDSCENE_MODEL_NAME: gpt-4o-2024-08-06
MIDSCENE_DEBUG_AI_PROFILE: 1

steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.inputs.branch || 'main' }}

- name: Setup pnpm
uses: pnpm/action-setup@v2
@@ -64,12 +72,59 @@ jobs:

- name: Run e2e tests
run: pnpm run e2e
id: e2e-tests
continue-on-error: true

- name: Upload e2e report
if: always()
uses: actions/upload-artifact@v4
with:
name: e2e-report
path: ${{ github.workspace }}/packages/web-integration/midscene_run/report
if-no-files-found: ignore

- name: Run e2e tests cache
run: pnpm run e2e:cache
id: e2e-tests-cache
continue-on-error: true

- name: Upload e2e cache report
if: always()
uses: actions/upload-artifact@v4
with:
name: e2e-cache-report
path: ${{ github.workspace }}/packages/web-integration/midscene_run/report
if-no-files-found: ignore

- name: Run e2e tests report
run: pnpm run e2e:report
id: e2e-tests-report
continue-on-error: true

- name: Upload e2e report output
if: always()
uses: actions/upload-artifact@v4
with:
name: e2e-report-output
path: ${{ github.workspace }}/packages/web-integration/midscene_run/report
if-no-files-found: ignore

- name: Run tests
run: pnpm run test:ai
id: test-ai
continue-on-error: true

- name: Upload test-ai output
if: always()
uses: actions/upload-artifact@v4
with:
name: test-ai-output
path: ${{ github.workspace }}/packages/web-integration/midscene_run/report
if-no-files-found: ignore

- name: Check if script failed
if: steps.test-ai.outcome == 'failure' || steps.e2e-tests.outcome == 'failure' || steps.e2e-tests-cache.outcome == 'failure' || steps.e2e-tests-report.outcome == 'failure'
run: exit 1



2 changes: 1 addition & 1 deletion .vscode/settings.json
@@ -4,5 +4,5 @@
},
"editor.defaultFormatter": "biomejs.biome",
"editor.formatOnSave": true,
"cSpell.words": ["AITEST", "httpbin"]
"cSpell.words": ["AITEST", "aweme", "httpbin", "iconfont", "taobao"]
}
2 changes: 1 addition & 1 deletion apps/site/docs/en/automate-with-scripts-in-yaml.mdx
@@ -137,7 +137,7 @@ target:

# object, the strategy to wait for network idle, optional
waitForNetworkIdle:
# number, the timeout in milliseconds, 30000 for default, optional
# number, the timeout in milliseconds, 10000ms for default, optional
timeout: <ms>
# boolean, continue on network idle error, true for default
continueOnNetworkIdleError: <boolean>
2 changes: 1 addition & 1 deletion apps/site/docs/en/integrate-with-playwright.mdx
@@ -37,7 +37,7 @@ export default defineConfig({

## Step 2. extend the `test` instance

Save the following code as `./fixture.ts`;
Save the following code as `./e2e/fixture.ts`;

```typescript
import { test as base } from '@playwright/test';
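
For reference, the complete fixture in the Midscene documentation looks roughly like the sketch below. The `@midscene/web/playwright` import path and the `PlaywrightAiFixture` helper come from the Midscene docs rather than from this diff, so treat them as assumptions:

```typescript
// ./e2e/fixture.ts: a minimal sketch, assuming the PlaywrightAiFixture
// helper exported by @midscene/web/playwright (not shown in this diff).
import { test as base } from '@playwright/test';
import type { PlayWrightAiFixtureType } from '@midscene/web/playwright';
import { PlaywrightAiFixture } from '@midscene/web/playwright';

// Extends Playwright's `test` with Midscene fixtures such as `ai` and `aiQuery`.
export const test = base.extend<PlayWrightAiFixtureType>(PlaywrightAiFixture());
```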
16 changes: 14 additions & 2 deletions apps/site/docs/en/prompting-tips.md
@@ -6,20 +6,32 @@ The natural language parameter passed to Midscene will be part of the prompt sent

Since AI models are heuristic by nature, the purpose of prompt tuning is to obtain stable responses from the model across runs. In most cases, expecting a consistent response from the LLM by using a good prompt is entirely feasible.

## Detailed descriptions and samples are welcome
## Use detailed descriptions and samples

Detailed descriptions and examples are always welcome.

For example:

Bad ❌: "Search 'headphone'"

Good ✅: "Find the search box (it should be along with a region switch, such as 'domestic' or 'international'), type 'headphone', and hit Enter."
Good ✅: "Click the search box (it should be along with a region switch, such as 'domestic' or 'international'), type 'headphone', and hit Enter."

Bad ❌: "Assert: food delivery service is in normal state"

Good ✅: "Assert: There is a 'food delivery service' on page, and is in normal state"
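
As a minimal sketch of how such prompts are used in a Playwright test, assuming the Midscene fixture from the integration docs (which exposes `ai` and `aiAssert`):

```typescript
import { test } from './fixture'; // the fixture from the Playwright integration docs

// Each prompt below reuses the "Good" wording from this page verbatim.
test('detailed prompts', async ({ page, ai, aiAssert }) => {
  await page.goto('https://www.example.com'); // assumption: any e-commerce page

  // Detailed, unambiguous instruction, in the "Good" style above.
  await ai(
    "Click the search box (it should be along with a region switch, such as 'domestic' or 'international'), type 'headphone', and hit Enter",
  );

  // Assertion phrased with concrete, checkable details.
  await aiAssert("There is a 'food delivery service' on the page, and it is in normal state");
});
```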

## One prompt should only do one thing

Use `.ai` to do one task at a time. Although Midscene has an auto-replanning strategy, it's still preferable to keep each prompt concise; otherwise the LLM output will likely be messy. The token cost of a long prompt and a short prompt is almost the same.

Bad ❌: "Click Login button, then click Sign up button, fill the form with '[email protected]' in the email field, 'test' in the password field, and click Sign up button"

Good ✅: Split the task into three steps (see the sketch after these steps):

"Click Login Button"
"Click Sign up button"
"Fill the form with '[email protected]' in the email field, 'test' in the password field, and click Sign up button"

## LLMs can NOT tell exact numbers like coordinates or hex-style colors, so give it some choices

For example:
2 changes: 1 addition & 1 deletion apps/site/docs/zh/automate-with-scripts-in-yaml.mdx
@@ -137,7 +137,7 @@ target:

# 等待网络空闲的策略,可选
waitForNetworkIdle:
# 等待超时时间,可选,默认 30000
# 等待超时时间,可选,默认 10000ms
timeout: <ms>
# 是否在等待超时后继续,可选,默认 true
continueOnNetworkIdleError: <boolean>
2 changes: 1 addition & 1 deletion apps/site/docs/zh/integrate-with-playwright.mdx
@@ -38,7 +38,7 @@ export default defineConfig({

## 第二步:扩展 `test` 实例

把下方代码保存为 `./fixture.ts`;
把下方代码保存为 `./e2e/fixture.ts`;

```typescript
import { test as base } from '@playwright/test';
12 changes: 12 additions & 0 deletions apps/site/docs/zh/prompting-tips.md
@@ -19,6 +19,18 @@

正确示例 ✅: "断言:界面上有个“外卖服务”的板块,并且标识着“正常”"

## 一个 Prompt (指令)只做一件事

使用 `.ai` 每次只做一件事。尽管 Midscene 有自动重规划能力,但仍应保持指令简洁。否则,LLM 的输出可能会变得混乱。指令的长度对 token 消耗的影响几乎可以忽略不计。

错误示例 ❌: "点击登录按钮,然后点击注册按钮,在表单中输入'[email protected]'作为邮箱,'test'作为密码,然后点击注册按钮"

正确示例 ✅: 将任务分解为三个步骤:

"点击登录按钮"
"点击注册按钮"
"在表单中输入'[email protected]'作为邮箱,'test'作为密码,然后点击注册按钮"

### LLM 无法准确辨别数值(比如坐标或十六进制颜色值),不妨提供一些选项

例如:
2 changes: 1 addition & 1 deletion packages/cli/src/tty-renderer.ts
@@ -3,7 +3,7 @@ import type { Writable } from 'node:stream';
import { stripVTControlCharacters } from 'node:util';
import restoreCursor from 'restore-cursor';

const DEFAULT_RENDER_INTERVAL = 16;
const DEFAULT_RENDER_INTERVAL = 160;

const ESC = '\x1B[';
const CLEAR_LINE = `${ESC}K`;
3 changes: 2 additions & 1 deletion packages/cli/src/yaml-player.ts
@@ -33,6 +33,7 @@ export const defaultUA =
export const defaultViewportWidth = 1280;
export const defaultViewportHeight = 960;
export const defaultViewportScale = process.platform === 'darwin' ? 2 : 1;
export const defaultWaitForNetworkIdleTimeout = 10 * 1000;

export function loadYamlScript(
content: string,
@@ -374,7 +375,7 @@ export class ScriptPlayer {
const waitForNetworkIdleTimeout =
typeof target.waitForNetworkIdle?.timeout === 'number'
? target.waitForNetworkIdle.timeout
: 30 * 1000;
: defaultWaitForNetworkIdleTimeout;
try {
if (waitForNetworkIdleTimeout > 0) {
await page.waitForNetworkIdle({
3 changes: 1 addition & 2 deletions packages/midscene/package.json
@@ -31,10 +31,9 @@
"upgrade": "modern upgrade",
"test": "vitest --run -u",
"test:ai": "AITEST=true npm run test",
"evaluate": "npm run test:ai -- tests/ai/evaluate/inspect.test.ts",
"computer": "TEST_COMPUTER=true npm run test:ai -- tests/ai/evaluate/computer.test.ts",
"evaluate": "npm run test:ai -- tests/ai/evaluate/inspect.test.ts",
"evaluate:update": "UPDATE_AI_DATA=true npm run test:ai -- tests/ai/evaluate/inspect.test.ts",
"evaluate:plan": "PLAN_INSPECT=true npm run test:ai -- tests/ai/evaluate/inspect.test.ts",
"prepublishOnly": "npm run build"
},
"dependencies": {
14 changes: 14 additions & 0 deletions packages/midscene/src/action/executor.ts
@@ -20,15 +20,19 @@ export class Executor {
// status of executor
status: 'init' | 'pending' | 'running' | 'completed' | 'error';

onFlushUpdate?: () => void;

constructor(
name: string,
description?: string,
tasks?: ExecutionTaskApply[],
onFlushUpdate?: () => void,
) {
this.status = tasks && tasks.length > 0 ? 'pending' : 'init';
this.name = name;
this.description = description;
this.tasks = (tasks || []).map((item) => this.markTaskAsPending(item));
this.onFlushUpdate = onFlushUpdate;
}

private markTaskAsPending(task: ExecutionTaskApply): ExecutionTask {
@@ -80,6 +84,13 @@

while (taskIndex < this.tasks.length) {
const task = this.tasks[taskIndex];
try {
if (this.onFlushUpdate) {
this.onFlushUpdate();
}
} catch (e) {
// console.error('error in onFlushUpdate', e);
}
assert(
task.status === 'pending',
`task status should be pending, but got: ${task.status}`,
@@ -151,6 +162,9 @@
} else {
this.status = 'error';
}
if (this.onFlushUpdate) {
await this.onFlushUpdate();
}
if (this.tasks.length) {
// return the last output
const outputIndex = Math.min(taskIndex, this.tasks.length - 1);
66 changes: 33 additions & 33 deletions packages/midscene/src/ai-model/automation/index.ts
@@ -1,5 +1,5 @@
import assert from 'node:assert';
import type { PlanningAIResponse, PlanningAction, UIContext } from '@/types';
import type { AIUsageInfo, PlanningAIResponse, UIContext } from '@/types';
import {
AIActionType,
type AIArgs,
@@ -12,21 +12,33 @@ import { describeUserPage } from '../prompt/util';
export async function plan(
userPrompt: string,
opts: {
whatHaveDone?: string;
originalPrompt?: string;
context: UIContext;
callAI?: typeof callAiFn<PlanningAIResponse>;
},
useModel?: 'coze' | 'openAI',
): Promise<{
plans: PlanningAction[];
}> {
): Promise<PlanningAIResponse> {
const { callAI, context } = opts || {};
const { screenshotBase64 } = context;
const { screenshotBase64, screenshotBase64WithElementMarker } = context;
const { description: pageDescription, elementByPosition } =
await describeUserPage(context);
let planFromAI: PlanningAIResponse | undefined;

const systemPrompt = systemPromptToTaskPlanning();

let taskBackgroundContext = '';
if (opts.originalPrompt && opts.whatHaveDone) {
taskBackgroundContext = `For your information, this is a task that some important person handed to you. Here is the original task description and what have been done after the previous actions:
=====================================
Original task description:
${opts.originalPrompt}
=====================================
What have been done:
${opts.whatHaveDone}
=====================================
`;
}
const msgs: AIArgs = [
{ role: 'system', content: systemPrompt },
{
@@ -35,59 +47,47 @@ export async function plan(
{
type: 'image_url',
image_url: {
url: screenshotBase64,
url: screenshotBase64WithElementMarker || screenshotBase64,
detail: 'high',
},
},
{
type: 'text',
text: `
pageDescription:\n
${pageDescription}
\n
Here is the description of the task. Just go ahead:
=====================================
${userPrompt}
=====================================
`,
pageDescription:\n
${pageDescription}
\n
Here is what you need to do now:
=====================================
${userPrompt}
=====================================
${taskBackgroundContext}
`.trim(),
},
]),
},
];

const call = callAI || callAiFn;
planFromAI = await call({
const { content, usage } = await call({
msgs,
AIActionType: AIActionType.PLAN,
useModel,
});

const actions = planFromAI?.actions || [];
planFromAI = content;

assert(planFromAI, "can't get planFromAI");
const actions = planFromAI?.actions || [];
assert(planFromAI, "can't get plans from AI");
assert(
actions.length > 0,
`no actions in ai plan with context: ${planFromAI}`,
);

actions.forEach((action) => {
if (action.type === 'Locate' && action.quickAnswer) {
if ('id' in action.quickAnswer) {
return;
}

if ('position' in action.quickAnswer) {
action.quickAnswer = {
...action.quickAnswer,
id: elementByPosition(action.quickAnswer.position)?.id!,
};
}
}
});

if (planFromAI.error) {
throw new Error(planFromAI.error);
}

return { plans: actions };
return planFromAI;
}
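
To illustrate the reworked signature, here is a minimal sketch of a caller that feeds `originalPrompt` and `whatHaveDone` back into `plan()` when replanning after some actions have already run. The surrounding helper is illustrative, not code from this commit:

```typescript
// Sketch only: the option names mirror the diff above.
import { plan } from '@/ai-model/automation';
import type { UIContext } from '@/types';

async function planNextRound(
  context: UIContext,
  originalPrompt: string,
  completedSteps: string[], // descriptions of actions already executed
) {
  // On a replanning round, the original instruction and a summary of prior
  // actions become the task background context in the LLM prompt.
  return plan('Continue the task based on what has been done so far', {
    context,
    originalPrompt,
    whatHaveDone: completedSteps.join('\n'),
  });
}
```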
