Skip to content

Commit

Permalink
chore: update prompts
Browse files (browse the repository at this point in the history)
  • Loading branch information
yuyutaotao committed Feb 13, 2025
1 parent a8c5e8f commit 953110a
Show file tree
Hide file tree
Showing 9 changed files with 56 additions and 33 deletions.
2 changes: 1 addition & 1 deletion packages/midscene/src/ai-model/inspect.ts
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,7 @@ export async function AiAssert<
{
type: 'text',
text: `
Here is the description of the assertion. Just go ahead:
Here is the assertion. Please tell whether it is truthy according to the screenshot.
=====================================
${assertion}
=====================================
Expand Down
18 changes: 13 additions & 5 deletions packages/midscene/src/ai-model/llm-planning.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,14 @@ export async function plan(

const call = callAI || callAiFn;
const { content, usage } = await call(msgs, AIActionType.PLAN);
const rawResponse = JSON.stringify(content, undefined, 2);
const planFromAI = content;
planFromAI.usage = usage;

const actions = planFromAI?.actions || [];
const returnValue: PlanningAIResponse = {
...planFromAI,
rawResponse,
usage,
};

if (getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL)) {
const zoomFactorX = await qwenVLZoomFactor(size.width);
Expand All @@ -89,8 +93,12 @@ export async function plan(
if (action.locate?.bbox) {
action.locate.bbox[0] = Math.ceil(action.locate.bbox[0] * zoomFactorX);
action.locate.bbox[1] = Math.ceil(action.locate.bbox[1] * zoomFactorY);
action.locate.bbox[2] = Math.ceil(action.locate.bbox[2] * zoomFactorX);
action.locate.bbox[3] = Math.ceil(action.locate.bbox[3] * zoomFactorY);
action.locate.bbox[2] = Math.ceil(
(action.locate.bbox[2] || action.locate.bbox[0] + 20) * zoomFactorX, // sometimes the bbox is not complete
);
action.locate.bbox[3] = Math.ceil(
(action.locate.bbox[3] || action.locate.bbox[1] + 20) * zoomFactorY,
);
}
});
}
Expand All @@ -101,5 +109,5 @@ export async function plan(
`Failed to plan actions: ${planFromAI.error || '(no error details)'}`,
);

return planFromAI;
return returnValue;
}
10 changes: 7 additions & 3 deletions packages/midscene/src/ai-model/prompt/llm-locator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ export function systemPromptToLocateElement() {
if (getAIConfigInBoolean(MATCH_BY_POSITION)) {
return `
## Role:
You are an expert in software page image (2D) and page element text analysis.
You are an expert in software testing.
## Objective:
- Identify elements in screenshots and text that match the user's description.
Expand All @@ -15,10 +15,14 @@ You are an expert in software page image (2D) and page element text analysis.
## Output Format:
\`\`\`json
{
"bbox": [number, number, number, number], // The bounding box of the element that matches the user's description best in the screenshot
"errors"?: string[] // Optional, put the error message here(if any)
"bbox": [number, number, number, number],
"errors"?: string[]
}
\`\`\`
Fields:
* \`bbox\` is the bounding box of the element that matches the user's description best in the screenshot
* \`errors\` is an optional array of error messages (if any)
`;
}

Expand Down
12 changes: 6 additions & 6 deletions packages/midscene/src/ai-model/prompt/llm-planning.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ const locatorConfig = () => {
sample: '{"bbox": [20, 50, 200, 400], "prompt": "the search bar"}',
wrongSample: '{"bbox": [20, 50, 200, 400]}',
locateParam: `{
"bbox": [number, number, number, number], // the bounding box of the element found. It should either be the bounding box marked with a rectangle in the screenshot or the bounding box described in the description.
"prompt"?: string // the description of the element to find. It can only be omitted when locate is null.
"bbox": [number, number, number, number], // the bounding box of the element to manipulate
"prompt": string // the description of the element
} | null // If it's not on the page, the LocateParam should be null`,
sampleStepOfLocating: '',
};
Expand Down Expand Up @@ -73,17 +73,17 @@ type LocateParam = {locateParam}
Each action has a \`type\` and corresponding \`param\`. To be detailed:
- type: 'Tap', tap the located element
* {{ locate: LocateParam, param: null }}
* {{ locate: {format}, param: null }}
- type: 'Hover', move mouse over to the located element
* {{ locate: LocateParam, param: null }}
* {{ locate: {format}, param: null }}
- type: 'Input', replace the value in the input field
* {{ locate: LocateParam, param: {{ value: string }} }}
* {{ locate: {format}, param: {{ value: string }} }}
* \`value\` is the final required input value based on the existing input. No matter what modifications are required, just provide the final value to replace the existing input value.
- type: 'KeyboardPress', press a key
* {{ param: {{ value: string }} }}
- type: 'Scroll', scroll up or down.
* {{
locate: LocateParam | null,
locate: {format} | null,
param: {{
direction: 'down'(default) | 'up' | 'right' | 'left',
scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
Expand Down
5 changes: 3 additions & 2 deletions packages/midscene/src/ai-model/prompt/util.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,12 @@ export const extractDataSchema: ResponseFormatJSONSchema = {
export function systemPromptToAssert() {
return `
${characteristic}
User will give an assertion, and some information about the page. Based on the information you get, tell whether the assertion is truthy.
User will give an assertion and a screenshot of a page. Based on the information you get, tell whether the assertion is truthy.
Return in the following JSON format:
{
thought: string, // string, the thought of the assertion. Should in the same language as the assertion.
whatIsActuallyShownInString: string, // string, according to the screenshot, what is actually shown on the page. Describe by the same language as the assertion.
thought: string, // string, the thought of the assertion.
pass: true, // true or false, whether the assertion is truthy
}
`;
Expand Down
1 change: 1 addition & 0 deletions packages/midscene/src/ai-model/service-caller/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ export async function call(
model,
result.usage,
`${Date.now() - startTime}ms`,
result._request_id,
);
assert(
result.choices,
Expand Down
4 changes: 3 additions & 1 deletion packages/midscene/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,7 @@ export interface PlanningAIResponse {
furtherPlan?: PlanningFurtherPlan | null;
error?: string;
usage?: AIUsageInfo;
rawResponse?: string;
}

export interface PlanningFurtherPlan {
Expand Down Expand Up @@ -361,7 +362,7 @@ export interface ExecutionTaskApply<
param: TaskParam,
context: ExecutorContext,
) => // biome-ignore lint/suspicious/noConfusingVoidType: <explanation>
| Promise<ExecutionTaskReturn<TaskOutput, TaskLog> | undefined | void>
| Promise<ExecutionTaskReturn<TaskOutput, TaskLog> | undefined | void>
| undefined
| void;
}
Expand Down Expand Up @@ -397,6 +398,7 @@ export type ExecutionTask<
cost?: number;
aiCost?: number;
};
usage?: AIUsageInfo;
};

export interface ExecutionDump extends DumpMeta {
Expand Down
10 changes: 3 additions & 7 deletions packages/visualizer/src/component/detail-side.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import { paramStr, typeStr } from '@midscene/web/ui-utils';
import { RadiusSettingOutlined } from '@ant-design/icons';
import type {
BaseElement,
ExecutionTask,
ExecutionTaskInsightAssertion,
ExecutionTaskInsightLocate,
ExecutionTaskPlanning,
Expand Down Expand Up @@ -173,13 +174,8 @@ const DetailSide = (): JSX.Element => {
});
};

const usageInfo = (task as ExecutionTaskInsightLocate)?.log?.dump?.taskInfo
?.usage
? JSON.stringify(
(task as ExecutionTaskInsightLocate).log!.dump!.taskInfo!.usage,
undefined,
2,
)
const usageInfo = (task as ExecutionTask)?.usage
? JSON.stringify((task as ExecutionTask)?.usage, undefined, 2)
: '';

const metaKVElement = MetaKV({
Expand Down
27 changes: 19 additions & 8 deletions packages/web-integration/src/common/tasks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import type { WebPage } from '@/common/page';
import type { PuppeteerWebPage } from '@/puppeteer';
import {
type AIElementIdResponse,
type AIUsageInfo,
type DumpSubscriber,
type ExecutionRecorderItem,
type ExecutionTaskActionApply,
Expand Down Expand Up @@ -147,8 +148,10 @@ export class PageTaskExecutor {
'No prompt or id or position or bbox to locate',
);
let insightDump: InsightDump | undefined;
let usage: AIUsageInfo | undefined;
const dumpCollector: DumpSubscriber = (dump) => {
insightDump = dump;
usage = dump?.taskInfo?.usage;
};
this.insight.onceDumpUpdatedFn = dumpCollector;
const shotTime = Date.now();
Expand Down Expand Up @@ -217,6 +220,7 @@ export class PageTaskExecutor {
},
recorder: [recordItem],
aiCost,
usage,
};
},
};
Expand Down Expand Up @@ -521,12 +525,18 @@ export class PageTaskExecutor {
});
}

const { actions, furtherPlan, taskWillBeAccomplished, error } =
planResult;
// console.log('actions', taskWillBeAccomplished, actions, furtherPlan);
const {
actions,
furtherPlan,
// taskWillBeAccomplished,
error,
usage,
rawResponse,
} = planResult;

let stopCollecting = false;
let bboxCollected = false;
let planParsingError = '';
const finalActions = actions.reduce<PlanningAction[]>(
(acc, planningAction) => {
if (stopCollecting) {
Expand All @@ -547,16 +557,13 @@ export class PageTaskExecutor {
acc.push({
type: 'Locate',
locate: planningAction.locate,
// remove id from planning, since the result is not accurate
// locate: {
// prompt: planningAction.locate.prompt,
// },
param: null,
thought: planningAction.locate.prompt,
});
} else if (
['Tap', 'Hover', 'Input'].includes(planningAction.type)
) {
planParsingError = `invalid planning response: ${JSON.stringify(planningAction)}`;
// should include locate but get null
stopCollecting = true;
return acc;
Expand All @@ -569,7 +576,9 @@ export class PageTaskExecutor {

assert(
finalActions.length > 0,
error ? `No plan: ${error}` : 'No plans found',
error
? `Failed to plan: ${error}`
: planParsingError || 'No plan found',
);

cacheGroup.saveCache({
Expand All @@ -593,6 +602,8 @@ export class PageTaskExecutor {
},
pageContext,
recorder: [recordItem],
usage,
rawResponse,
};
},
};
Expand Down

0 comments on commit 953110a

Please sign in to comment.