From 1996b9744034b240b615a1540328ff51f2f72589 Mon Sep 17 00:00:00 2001 From: killian <63927363+KillianLucas@users.noreply.github.com> Date: Sun, 10 Dec 2023 14:25:12 -0800 Subject: [PATCH] The New Computer Update --- .../core/computer/keyboard/keyboard.py | 39 +++++++------ interpreter/core/computer/mouse/mouse.py | 4 +- .../core/computer/terminal/languages/react.py | 57 +++++++++++++++---- .../core/computer/terminal/languages/shell.py | 1 + .../core/computer/terminal/terminal.py | 35 +++++++----- interpreter/core/core.py | 15 +++-- interpreter/core/generate_system_message.py | 7 ++- interpreter/core/llm/setup_text_llm.py | 4 +- interpreter/core/respond.py | 25 +++++++- interpreter/core/utils/scan_code.py | 30 +--------- .../start_terminal_interface.py | 31 ++++++---- .../terminal_interface/terminal_interface.py | 2 +- 12 files changed, 154 insertions(+), 96 deletions(-) diff --git a/interpreter/core/computer/keyboard/keyboard.py b/interpreter/core/computer/keyboard/keyboard.py index 5cda302624..f6475f3cdd 100644 --- a/interpreter/core/computer/keyboard/keyboard.py +++ b/interpreter/core/computer/keyboard/keyboard.py @@ -5,36 +5,43 @@ import pyautogui -pyautogui.FAILSAFE = False - class Keyboard: def write(self, text): # Split the text into words words = text.split(" ") - # Type each word - for word in words: + # Type each word with a space after it, unless it's the last word + for i, word in enumerate(words): # Type the word pyautogui.write(word) - # Add a delay after each word + # Add a space after the word if it's not the last word + if i != len(words) - 1: + pyautogui.write(" ") + # Add a delay after each word to simulate ChatGPT time.sleep(random.uniform(0.1, 0.3)) def press(self, keys): pyautogui.press(keys) def hotkey(self, *args): - if "darwin" in platform.system().lower(): - # For some reason, application focus or something, we need to do this for spotlight - # only if they passed in "command", "space" or "command", " ", or those in another order - if set(args) == {"command", " "} or set(args) == {"command", "space"}: - os.system( - """ - osascript -e 'tell application "System Events" to keystroke " " using {command down}' - """ - ) - else: - pyautogui.hotkey(*args) + modifiers = {"command", "control", "option", "shift"} + if "darwin" in platform.system().lower() and len(args) == 2: + # pyautogui.hotkey seems to not work, so we use applescript + # Determine which argument is the keystroke and which is the modifier + keystroke, modifier = args if args[0] not in modifiers else args[::-1] + + # Create the AppleScript + script = f""" + tell application "System Events" + keystroke "{keystroke}" using {modifier} + end tell + """ + + # Execute the AppleScript + os.system("osascript -e '{}'".format(script)) + else: + pyautogui.hotkey(*args) def down(self, key): pyautogui.keyDown(key) diff --git a/interpreter/core/computer/mouse/mouse.py b/interpreter/core/computer/mouse/mouse.py index 2e718fa4ef..242280369b 100644 --- a/interpreter/core/computer/mouse/mouse.py +++ b/interpreter/core/computer/mouse/mouse.py @@ -7,8 +7,6 @@ from ..utils.computer_vision import find_text_in_image -pyautogui.FAILSAFE = False - class Mouse: def __init__(self, computer): @@ -33,7 +31,7 @@ def move(self, *args, x=None, y=None, index=None, svg=None): if len(centers) > 1: if index == None: print( - f"This text ('{text}') was found multiple times on screen. Please try 'click()' again, but pass in an `index` int to identify which one you want to click. The indices have been drawn on the attached image." + f"(Message for language model) This text ('{text}') was found multiple times on screen. Please try 'click()' again, but pass in an `index` int to identify which one you want to click. The indices have been drawn on the image." ) # Show the image using matplotlib plt.imshow(np.array(bounding_box_image)) diff --git a/interpreter/core/computer/terminal/languages/react.py b/interpreter/core/computer/terminal/languages/react.py index 8ff57c9cbd..5c123a6109 100644 --- a/interpreter/core/computer/terminal/languages/react.py +++ b/interpreter/core/computer/terminal/languages/react.py @@ -1,7 +1,6 @@ -""" -Test this more— I don't think it understands the environment it's in. It tends to write "require" for example. Also make sure errors go back into it (console.log type stuff) -""" +import re +from ...utils.html_to_png_base64 import html_to_png_base64 from ..base_language import BaseLanguage template = """ @@ -27,18 +26,52 @@ """ -class HTML(BaseLanguage): - file_extension = "html" - proper_name = "React" +def is_incompatible(code): + lines = code.split("\n") + + # Check for require statements at the start of any of the first few lines + # Check for ES6 import/export statements + for line in lines[:5]: + if re.match(r"\s*require\(", line): + return True + if re.match(r"\s*import\s", line) or re.match(r"\s*export\s", line): + return True - def __init__(self, config): - super().__init__() - self.config = config + return False + + +class React(BaseLanguage): + name = "React" + file_extension = "html" + system_message = "When you execute code with `react`, your react code will be run in a script tag after being inserted into the HTML template, following the installation of React, ReactDOM, and Babel for JSX parsing. **We will handle this! Don't make an HTML file to run React, just execute `react`.**" def run(self, code): - # Everything happens in the terminal interface re: how you render HTML. - # In the future though, we should let the TUI do this but then also capture stuff like console.log errors here. + if is_incompatible(code): + yield { + "type": "console", + "format": "output", + "content": f"Error: React format not supported. {self.system_message} Therefore some things like `require` and 'import' aren't supported.", + "recipient": "assistant", + } + return code = template.replace("{insert_react_code}", code) - yield {"html": code} + yield { + "type": "console", + "format": "output", + "content": "React is being displayed on the user's machine...", + "recipient": "assistant", + } + + # User sees interactive HTML + yield {"type": "code", "format": "html", "content": code, "recipient": "user"} + + # Assistant sees image + base64 = html_to_png_base64(code) + yield { + "type": "image", + "format": "base64.png", + "content": base64, + "recipient": "assistant", + } diff --git a/interpreter/core/computer/terminal/languages/shell.py b/interpreter/core/computer/terminal/languages/shell.py index 6450bd178d..db79c885c0 100644 --- a/interpreter/core/computer/terminal/languages/shell.py +++ b/interpreter/core/computer/terminal/languages/shell.py @@ -8,6 +8,7 @@ class Shell(SubprocessLanguage): file_extension = "sh" name = "Shell" + aliases = ["bash", "sh", "zsh"] def __init__( self, diff --git a/interpreter/core/computer/terminal/terminal.py b/interpreter/core/computer/terminal/terminal.py index e363729557..8348c799d9 100644 --- a/interpreter/core/computer/terminal/terminal.py +++ b/interpreter/core/computer/terminal/terminal.py @@ -4,30 +4,35 @@ from .languages.powershell import PowerShell from .languages.python import Python from .languages.r import R +from .languages.react import React from .languages.shell import Shell -language_map = { - "python": Python, - "bash": Shell, - "shell": Shell, - "sh": Shell, - "zsh": Shell, - "javascript": JavaScript, - "html": HTML, - "applescript": AppleScript, - "r": R, - "powershell": PowerShell, -} - class Terminal: def __init__(self): - self.languages = [Python, Shell, JavaScript, HTML, AppleScript, R, PowerShell] + self.languages = [ + Python, + Shell, + JavaScript, + HTML, + AppleScript, + R, + PowerShell, + React, + ] self._active_languages = {} + def get_language(self, language): + for lang in self.languages: + if language.lower() == lang.name.lower() or ( + hasattr(lang, "aliases") and language in lang.aliases + ): + return lang + return None + def run(self, language, code): if language not in self._active_languages: - self._active_languages[language] = language_map[language]() + self._active_languages[language] = self.get_language(language)() try: yield from self._active_languages[language].run(code) except GeneratorExit: diff --git a/interpreter/core/core.py b/interpreter/core/core.py index 29921ea304..24bffcce39 100644 --- a/interpreter/core/core.py +++ b/interpreter/core/core.py @@ -124,13 +124,16 @@ def _streaming_chat(self, message=None, display=True): elif isinstance(message, list): self.messages = message + # DISABLED because I think we should just not transmit images to non-multimodal models? + # REENABLE this when multimodal becomes more common: + # Make sure we're using a model that can handle this - if not self.vision: - for message in self.messages: - if message["type"] == "image": - raise Exception( - "Use a multimodal model and set `interpreter.vision` to True to handle image messages." - ) + # if not self.vision: + # for message in self.messages: + # if message["type"] == "image": + # raise Exception( + # "Use a multimodal model and set `interpreter.vision` to True to handle image messages." + # ) # This is where it all happens! yield from self._respond_and_store() diff --git a/interpreter/core/generate_system_message.py b/interpreter/core/generate_system_message.py index 5ebd598c57..ffefa6ff78 100644 --- a/interpreter/core/generate_system_message.py +++ b/interpreter/core/generate_system_message.py @@ -27,9 +27,12 @@ def generate_system_message(interpreter): try: system_message += "\n" + get_relevant_procedures_string(interpreter) except: - raise if interpreter.debug_mode: print(traceback.format_exc()) # It's okay if they can't. This just fixes some common mistakes it makes. - return system_message + for language in interpreter.computer.terminal.languages: + if hasattr(language, "system_message"): + system_message += "\n\n" + language.system_message + + return system_message.strip() diff --git a/interpreter/core/llm/setup_text_llm.py b/interpreter/core/llm/setup_text_llm.py index 0d63fc5cf1..cfee44afd8 100644 --- a/interpreter/core/llm/setup_text_llm.py +++ b/interpreter/core/llm/setup_text_llm.py @@ -64,8 +64,10 @@ def base_llm(messages): except TypeError as e: if interpreter.vision and str(e) == "expected string or buffer": # There's just no way to use tokentrim on vision-enabled models yet. + # We instead handle this outside setup_text_llm! + if interpreter.debug_mode: - print("Couldn't token trim image messages. Error:", e) + print("Won't token trim image messages. ", e) ### DISABLED image trimming # To maintain the order of messages while simulating trimming, we will iterate through the messages diff --git a/interpreter/core/respond.py b/interpreter/core/respond.py index ad9c79d703..b1d6abdbc4 100644 --- a/interpreter/core/respond.py +++ b/interpreter/core/respond.py @@ -27,6 +27,26 @@ def respond(interpreter): messages_for_llm = interpreter.messages.copy() messages_for_llm = [system_message] + messages_for_llm + # Trim image messages if they're there + if interpreter.vision: + image_messages = [msg for msg in messages_for_llm if msg["type"] == "image"] + + if interpreter.os: + # Keep only the last image if the interpreter is running in OS mode + if len(image_messages) > 1: + for img_msg in image_messages[:-1]: + messages_for_llm.remove(img_msg) + if interpreter.debug_mode: + print("Removing image message!") + else: + # Delete all the middle ones (leave only the first and last 2 images) from messages_for_llm + if len(image_messages) > 3: + for img_msg in image_messages[1:-2]: + messages_for_llm.remove(img_msg) + if interpreter.debug_mode: + print("Removing image message!") + # Idea: we could set detail: low for the middle messages, instead of deleting them + ### RUN THE LLM ### try: @@ -74,7 +94,8 @@ def respond(interpreter): ) elif interpreter.local: raise Exception( - str(e) + "Error occurred. " + + str(e) + """ Please make sure LM Studio's local server is running by following the steps above, if you're using LM Studio (recommended). @@ -133,7 +154,7 @@ def respond(interpreter): break # don't let it import computer on os mode — we handle that! - if interpreter.os: + if interpreter.os and language == "python": code = code.replace("import computer", "") # yield each line diff --git a/interpreter/core/utils/scan_code.py b/interpreter/core/utils/scan_code.py index 49c68846cb..df8a5a5772 100644 --- a/interpreter/core/utils/scan_code.py +++ b/interpreter/core/utils/scan_code.py @@ -1,7 +1,6 @@ import os import subprocess -from ..computer.terminal.terminal import language_map from .temporary_file import cleanup_temporary_file, create_temporary_file try: @@ -11,37 +10,14 @@ pass -def get_language_file_extension(language_name): - """ - Get the file extension for a given language - """ - language = language_map[language_name.lower()] - - if language.file_extension: - return language.file_extension - else: - return language - - -def get_language_name(language_name): - """ - Get the proper name for a given language - """ - language = language_map[language_name.lower()] - - if language.name: - return language.name - else: - return language - - def scan_code(code, language, interpreter): """ Scan code with semgrep """ + language_class = interpreter.computer.terminal.get_language(language) temp_file = create_temporary_file( - code, get_language_file_extension(language), verbose=interpreter.debug_mode + code, language_class.file_extension, verbose=interpreter.debug_mode ) temp_path = os.path.dirname(temp_file) @@ -65,7 +41,7 @@ def scan_code(code, language, interpreter): ) if scan.returncode == 0: - language_name = get_language_name(language) + language_name = language_class.name print( f" {'Code Scanner: ' if interpreter.safe_mode == 'auto' else ''}No issues were found in this {language_name} code." ) diff --git a/interpreter/terminal_interface/start_terminal_interface.py b/interpreter/terminal_interface/start_terminal_interface.py index 6b9d8a0d3c..199d527fe8 100644 --- a/interpreter/terminal_interface/start_terminal_interface.py +++ b/interpreter/terminal_interface/start_terminal_interface.py @@ -303,6 +303,15 @@ def start_terminal_interface(interpreter): interpreter.max_tokens = 4096 interpreter.auto_run = True interpreter.force_task_completion = True + # This line made it use files too much + interpreter.system_message = interpreter.system_message.replace( + "If you want to send data between programming languages, save the data to a txt or json.\n", + "", + ) + interpreter.system_message = interpreter.system_message.replace( + "When a user refers to a filename, they're likely referring to an existing file in the directory you're currently executing code in.", + "The user is likely referring to something on their screen.", + ) interpreter.system_message += ( "\n\n" + """ @@ -310,41 +319,41 @@ def start_terminal_interface(interpreter): Execute code using `computer` (already imported) to control the user's computer: ```python -computer.screenshot() # Automatically runs plt.show() to show you what's on the screen, returns a PIL image in case you need it (rarely). **You almost always want to do this first! You don't know what's on the user's screen.** +computer.screenshot() # Automatically runs plt.show() to show you what's on the screen, returns a `pil_image` `in case you need it (rarely). **You almost always want to do this first! You don't know what's on the user's screen.** computer.screenshot(quadrant=1) # Get a detailed view of the upper left quadrant (you'll rarely need this, use it to examine/retry failed attempts) computer.keyboard.hotkey("space", "command") # Opens spotlight (very useful) computer.keyboard.write("hello") # .down() .up() and .press() also work (uses pyautogui) -computer.mouse.move("Text in a button") # This finds the button with that text -computer.mouse.move(x=0, y=0) # Not as accurate as click("Text")! +computer.mouse.move("Text Onscreen") # This moves the mouse to the UI element with that text. Use this **frequently** — and get creative! To mouse over a video thumbnail, you could pass the *timestamp* (which is usually written on the thumbnail) into this. +computer.mouse.move(x=500, y=500) # Not as accurate as click("Text")! computer.mouse.click() # Don't forget this! Include in the same code block # Dragging +computer.mouse.move("So I was") computer.mouse.down() -computer.mouse.move(x=100, y=100) +computer.mouse.move("and that's it!") computer.mouse.up() - -computer.clipboard.copy() -print(computer.clipboard.read()) # Returns contents of clipboard ``` For rare and complex mouse actions, consider using computer vision libraries on `pil_image` to produce a list of coordinates for the mouse to move/drag to. +If the user highlighted text in an editor, then asked you to modify it, they probably want you to `keyboard.write` it over their version of the text. + +Tasks are 100% computer-based. DO NOT simply write long messages to the user to complete tasks. You MUST put your text back into the program they're using to deliver your text! For example, overwriting some text they've highlighted with `keyboard.write`. + Use keyboard navigation when reasonably possible, but not if it involves pressing a button multiple times. The mouse is less reliable. Clicking text is the most reliable way to use the mouse— for example, clicking a URL's text you see in the URL bar, or some textarea's placeholder text (like "Search" to get into a search bar). Applescript might be best for some tasks. If you use `plt.show()`, the resulting image will be sent to you. However, if you use `PIL.Image.show()`, the resulting image will NOT be sent to you. -The user has enabled OS control. They have given you permission to execute any code to control their mouse and keyboard to complete the task. - **Include `computer.screenshot()` after a 2 second delay at the end of _every_ code block to verify your progress on the task.** -Try other methods if something seems to not work. Safari didn't work? Try Chrome! Try multiple methods before saying the task is impossible. **You can do it!** +Try multiple methods before saying the task is impossible. **You can do it!** -You are an expert computer navigator, brilliant and technical. You look closely at the screenshots to discern the state of the computer, then make the best possible decisions to complete the task. +You are an expert computer navigator, brilliant and technical. **Describe the screenshots with a lot of detail, including 1. the active app, 2. what text areas appear to be active, 3. what options you could take next.** Think carefully. """.strip() ) diff --git a/interpreter/terminal_interface/terminal_interface.py b/interpreter/terminal_interface/terminal_interface.py index b0a9fe79bc..9b30d28f94 100644 --- a/interpreter/terminal_interface/terminal_interface.py +++ b/interpreter/terminal_interface/terminal_interface.py @@ -62,7 +62,7 @@ def terminal_interface(interpreter, message): interactive = True pause_force_task_completion_loop = False - force_task_completion_message = "Proceed. If the entire task I asked for is done, say exactly 'The task is done.' If it's impossible, say 'The task is impossible.' (If I haven't provided a task, say exactly 'Let me know what you'd like to do next.') Otherwise keep going." + force_task_completion_message = """Proceed. If you want to write code, start your message with "```"! If the entire task I asked for is done, say exactly 'The task is done.' If it's impossible, say 'The task is impossible.' (If I haven't provided a task, say exactly 'Let me know what you'd like to do next.') Otherwise keep going.""" force_task_completion_responses = [ "the task is done.", "the task is impossible.",