chore: update prompts

web-infra-dev · Feb 13, 2025 · 953110a
1 parent a8c5e8f
commit 953110a
Show file tree

Hide file tree

Showing 9 changed files with 56 additions and 33 deletions.
diff --git a/packages/midscene/src/ai-model/inspect.ts b/packages/midscene/src/ai-model/inspect.ts
@@ -371,7 +371,7 @@ export async function AiAssert<
         {
           type: 'text',
           text: `
-Here is the description of the assertion. Just go ahead:
+Here is the assertion. Please tell whether it is truthy according to the screenshot.
 =====================================
 ${assertion}
 =====================================

diff --git a/packages/midscene/src/ai-model/llm-planning.ts b/packages/midscene/src/ai-model/llm-planning.ts
@@ -64,10 +64,14 @@ export async function plan(
 
   const call = callAI || callAiFn;
   const { content, usage } = await call(msgs, AIActionType.PLAN);
+  const rawResponse = JSON.stringify(content, undefined, 2);
   const planFromAI = content;
-  planFromAI.usage = usage;
-
   const actions = planFromAI?.actions || [];
+  const returnValue: PlanningAIResponse = {
+    ...planFromAI,
+    rawResponse,
+    usage,
+  };
 
   if (getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL)) {
     const zoomFactorX = await qwenVLZoomFactor(size.width);
@@ -89,8 +93,12 @@ export async function plan(
       if (action.locate?.bbox) {
         action.locate.bbox[0] = Math.ceil(action.locate.bbox[0] * zoomFactorX);
         action.locate.bbox[1] = Math.ceil(action.locate.bbox[1] * zoomFactorY);
-        action.locate.bbox[2] = Math.ceil(action.locate.bbox[2] * zoomFactorX);
-        action.locate.bbox[3] = Math.ceil(action.locate.bbox[3] * zoomFactorY);
+        action.locate.bbox[2] = Math.ceil(
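+          (action.locate.bbox[2] || action.locate.bbox[0] + 20) * zoomFactorX, // the model sometimes omits the right/bottom edge, so fall back to a 20px box. NOTE(review): bbox[0]/bbox[1] were already scaled above, so this fallback is multiplied by the zoom factor twice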
+        );
+        action.locate.bbox[3] = Math.ceil(
+          (action.locate.bbox[3] || action.locate.bbox[1] + 20) * zoomFactorY,
+        );
       }
     });
   }
@@ -101,5 +109,5 @@ export async function plan(
     `Failed to plan actions: ${planFromAI.error || '(no error details)'}`,
   );
 
-  return planFromAI;
+  return returnValue;
 }
diff --git a/packages/midscene/src/ai-model/prompt/llm-locator.ts b/packages/midscene/src/ai-model/prompt/llm-locator.ts
@@ -6,7 +6,7 @@ export function systemPromptToLocateElement() {
   if (getAIConfigInBoolean(MATCH_BY_POSITION)) {
     return `
 ## Role:
-You are an expert in software page image (2D) and page element text analysis.
+You are an expert in software testing.
 
 ## Objective:
 - Identify elements in screenshots and text that match the user's description.
@@ -15,10 +15,14 @@ You are an expert in software page image (2D) and page element text analysis.
 ## Output Format:
 \`\`\`json
 {
-  "bbox": [number, number, number, number], // The bounding box of the element that matches the user's description best in the screenshot
-  "errors"?: string[] // Optional, put the error message here(if any)
+  "bbox": [number, number, number, number], 
+  "errors"?: string[]
 }
 \`\`\`
+
+Fields:
+* \`bbox\` is the bounding box of the element that matches the user's description best in the screenshot
+* \`errors\` is an optional array of error messages (if any)
 `;
   }
 

diff --git a/packages/midscene/src/ai-model/prompt/llm-planning.ts b/packages/midscene/src/ai-model/prompt/llm-planning.ts
@@ -10,8 +10,8 @@ const locatorConfig = () => {
     sample: '{"bbox": [20, 50, 200, 400], "prompt": "the search bar"}',
     wrongSample: '{"bbox": [20, 50, 200, 400]}',
     locateParam: `{
-      "bbox": [number, number, number, number], // the bounding box of the element found. It should either be the bounding box marked with a rectangle in the screenshot or the bounding box described in the description.
-      "prompt"?: string // the description of the element to find. It can only be omitted when locate is null.
+      "bbox": [number, number, number, number], // the bounding box of the element to manipulate
+      "prompt": string // the description of the element
     } | null // If it's not on the page, the LocateParam should be null`,
     sampleStepOfLocating: '',
   };
@@ -73,17 +73,17 @@ type LocateParam = {locateParam}
 
 Each action has a \`type\` and corresponding \`param\`. To be detailed:
 - type: 'Tap', tap the located element
-  * {{ locate: LocateParam, param: null }}
+  * {{ locate: {format}, param: null }}
 - type: 'Hover', move mouse over to the located element
-  * {{ locate: LocateParam, param: null }}
+  * {{ locate: {format}, param: null }}
 - type: 'Input', replace the value in the input field
-  * {{ locate: LocateParam, param: {{ value: string }} }}
+  * {{ locate: {format}, param: {{ value: string }} }}
   * \`value\` is the final required input value based on the existing input. No matter what modifications are required, just provide the final value to replace the existing input value. 
 - type: 'KeyboardPress', press a key
   * {{ param: {{ value: string }} }}
 - type: 'Scroll', scroll up or down.
   * {{ 
-      locate: LocateParam | null, 
+      locate: {format} | null, 
       param: {{ 
         direction: 'down'(default) | 'up' | 'right' | 'left', 
         scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', 

diff --git a/packages/midscene/src/ai-model/prompt/util.ts b/packages/midscene/src/ai-model/prompt/util.ts
@@ -100,11 +100,12 @@ export const extractDataSchema: ResponseFormatJSONSchema = {
 export function systemPromptToAssert() {
   return `
 ${characteristic}
-User will give an assertion, and some information about the page. Based on the information you get, tell whether the assertion is truthy.
+User will give an assertion and a screenshot of a page. Based on the information you get, tell whether the assertion is truthy.
 
 Return in the following JSON format:
 {
-  thought: string, // string, the thought of the assertion. Should in the same language as the assertion.
+  whatIsActuallyShownInString: string, // string, according to the screenshot, what is actually shown on the page. Describe by the same language as the assertion.
+  thought: string, // string, the thought of the assertion.
   pass: true, // true or false, whether the assertion is truthy
 }
 `;

diff --git a/packages/midscene/src/ai-model/service-caller/index.ts b/packages/midscene/src/ai-model/service-caller/index.ts
@@ -218,6 +218,7 @@ export async function call(
         model,
         result.usage,
         `${Date.now() - startTime}ms`,
+        result._request_id,
       );
     assert(
       result.choices,

diff --git a/packages/midscene/src/types.ts b/packages/midscene/src/types.ts
@@ -267,6 +267,7 @@ export interface PlanningAIResponse {
   furtherPlan?: PlanningFurtherPlan | null;
   error?: string;
   usage?: AIUsageInfo;
+  rawResponse?: string;
 }
 
 export interface PlanningFurtherPlan {
@@ -361,7 +362,7 @@ export interface ExecutionTaskApply<
     param: TaskParam,
     context: ExecutorContext,
   ) => // biome-ignore lint/suspicious/noConfusingVoidType: <explanation>
-    | Promise<ExecutionTaskReturn<TaskOutput, TaskLog> | undefined | void>
+  | Promise<ExecutionTaskReturn<TaskOutput, TaskLog> | undefined | void>
     | undefined
     | void;
 }
@@ -397,6 +398,7 @@ export type ExecutionTask<
       cost?: number;
       aiCost?: number;
     };
+    usage?: AIUsageInfo;
   };
 
 export interface ExecutionDump extends DumpMeta {

diff --git a/packages/visualizer/src/component/detail-side.tsx b/packages/visualizer/src/component/detail-side.tsx
@@ -7,6 +7,7 @@ import { paramStr, typeStr } from '@midscene/web/ui-utils';
 import { RadiusSettingOutlined } from '@ant-design/icons';
 import type {
   BaseElement,
+  ExecutionTask,
   ExecutionTaskInsightAssertion,
   ExecutionTaskInsightLocate,
   ExecutionTaskPlanning,
@@ -173,13 +174,8 @@ const DetailSide = (): JSX.Element => {
     });
   };
 
-  const usageInfo = (task as ExecutionTaskInsightLocate)?.log?.dump?.taskInfo
-    ?.usage
-    ? JSON.stringify(
-        (task as ExecutionTaskInsightLocate).log!.dump!.taskInfo!.usage,
-        undefined,
-        2,
-      )
+  const usageInfo = (task as ExecutionTask)?.usage
+    ? JSON.stringify((task as ExecutionTask)?.usage, undefined, 2)
     : '';
 
   const metaKVElement = MetaKV({

diff --git a/packages/web-integration/src/common/tasks.ts b/packages/web-integration/src/common/tasks.ts
@@ -3,6 +3,7 @@ import type { WebPage } from '@/common/page';
 import type { PuppeteerWebPage } from '@/puppeteer';
 import {
   type AIElementIdResponse,
+  type AIUsageInfo,
   type DumpSubscriber,
   type ExecutionRecorderItem,
   type ExecutionTaskActionApply,
@@ -147,8 +148,10 @@ export class PageTaskExecutor {
               'No prompt or id or position or bbox to locate',
             );
             let insightDump: InsightDump | undefined;
+            let usage: AIUsageInfo | undefined;
             const dumpCollector: DumpSubscriber = (dump) => {
               insightDump = dump;
+              usage = dump?.taskInfo?.usage;
             };
             this.insight.onceDumpUpdatedFn = dumpCollector;
             const shotTime = Date.now();
@@ -217,6 +220,7 @@ export class PageTaskExecutor {
               },
               recorder: [recordItem],
               aiCost,
+              usage,
             };
           },
         };
@@ -521,12 +525,18 @@ export class PageTaskExecutor {
           });
         }
 
-        const { actions, furtherPlan, taskWillBeAccomplished, error } =
-          planResult;
-        // console.log('actions', taskWillBeAccomplished, actions, furtherPlan);
+        const {
+          actions,
+          furtherPlan,
+          // taskWillBeAccomplished,
+          error,
+          usage,
+          rawResponse,
+        } = planResult;
 
         let stopCollecting = false;
         let bboxCollected = false;
+        let planParsingError = '';
         const finalActions = actions.reduce<PlanningAction[]>(
           (acc, planningAction) => {
             if (stopCollecting) {
@@ -547,16 +557,13 @@ export class PageTaskExecutor {
               acc.push({
                 type: 'Locate',
                 locate: planningAction.locate,
-                // remove id from planning, since the result is not accurate
-                // locate: {
-                //   prompt: planningAction.locate.prompt,
-                // },
                 param: null,
                 thought: planningAction.locate.prompt,
               });
             } else if (
               ['Tap', 'Hover', 'Input'].includes(planningAction.type)
             ) {
+              planParsingError = `invalid planning response: ${JSON.stringify(planningAction)}`;
               // should include locate but get null
               stopCollecting = true;
               return acc;
@@ -569,7 +576,9 @@ export class PageTaskExecutor {
 
         assert(
           finalActions.length > 0,
-          error ? `No plan: ${error}` : 'No plans found',
+          error
+            ? `Failed to plan: ${error}`
+            : planParsingError || 'No plan found',
         );
 
         cacheGroup.saveCache({
@@ -593,6 +602,8 @@ export class PageTaskExecutor {
           },
           pageContext,
           recorder: [recordItem],
+          usage,
+          rawResponse,
         };
       },
     };