Skip to content

Commit

Permalink
The New Computer Update
Browse files Browse the repository at this point in the history
  • Loading branch information
KillianLucas committed Dec 10, 2023
1 parent aa1d2a8 commit 1996b97
Show file tree
Hide file tree
Showing 12 changed files with 154 additions and 96 deletions.
39 changes: 23 additions & 16 deletions interpreter/core/computer/keyboard/keyboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,36 +5,43 @@

import pyautogui

pyautogui.FAILSAFE = False


class Keyboard:
def write(self, text):
    """Type `text` on the keyboard, word by word, with a short random
    pause after each word to simulate human typing.

    Args:
        text: The string to type. It is split on single spaces; the
            spaces are re-inserted between words as they are typed.
    """
    # Split the text into words
    words = text.split(" ")

    # Type each word with a space after it, unless it's the last word
    for i, word in enumerate(words):
        # Type the word
        pyautogui.write(word)
        # Re-insert the space that split() consumed, except after the last word
        if i != len(words) - 1:
            pyautogui.write(" ")
        # Add a delay after each word to simulate ChatGPT
        time.sleep(random.uniform(0.1, 0.3))

def press(self, keys):
    """Press the given key (or list of keys) once via pyautogui.

    `keys` uses pyautogui key names — presumably a single key string or a
    list of them; confirm against pyautogui.press's accepted forms.
    """
    pyautogui.press(keys)

def hotkey(self, *args):
    """Press a key combination (e.g. hotkey("command", "space")).

    On macOS, pyautogui.hotkey seems to not work for two-key combos, so we
    fall back to AppleScript via System Events; everywhere else (and for
    combos of other sizes) we delegate to pyautogui.hotkey.
    """
    modifiers = {"command", "control", "option", "shift"}
    if "darwin" in platform.system().lower() and len(args) == 2:
        # Determine which argument is the keystroke and which is the modifier
        keystroke, modifier = args if args[0] not in modifiers else args[::-1]

        # Create the AppleScript. AppleScript modifier flags must be written
        # as e.g. "command down", not bare "command" — the previous working
        # version used `using {command down}`.
        script = f"""
        tell application "System Events"
            keystroke "{keystroke}" using {modifier} down
        end tell
        """

        # Execute the AppleScript
        os.system("osascript -e '{}'".format(script))
    else:
        pyautogui.hotkey(*args)

def down(self, key):
    """Hold `key` down without releasing it (pyautogui.keyDown)."""
    pyautogui.keyDown(key)
Expand Down
4 changes: 1 addition & 3 deletions interpreter/core/computer/mouse/mouse.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@

from ..utils.computer_vision import find_text_in_image

pyautogui.FAILSAFE = False


class Mouse:
def __init__(self, computer):
Expand All @@ -33,7 +31,7 @@ def move(self, *args, x=None, y=None, index=None, svg=None):
if len(centers) > 1:
if index == None:
print(
f"This text ('{text}') was found multiple times on screen. Please try 'click()' again, but pass in an `index` int to identify which one you want to click. The indices have been drawn on the attached image."
f"(Message for language model) This text ('{text}') was found multiple times on screen. Please try 'click()' again, but pass in an `index` int to identify which one you want to click. The indices have been drawn on the image."
)
# Show the image using matplotlib
plt.imshow(np.array(bounding_box_image))
Expand Down
57 changes: 45 additions & 12 deletions interpreter/core/computer/terminal/languages/react.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""
Test this more— I don't think it understands the environment it's in. It tends to write "require" for example. Also make sure errors go back into it (console.log type stuff)
"""
import re

from ...utils.html_to_png_base64 import html_to_png_base64
from ..base_language import BaseLanguage

template = """<!DOCTYPE html>
Expand All @@ -27,18 +26,52 @@
</html>"""


class HTML(BaseLanguage):
file_extension = "html"
proper_name = "React"
def is_incompatible(code):
    """Return True if `code` uses module-loading syntax that the in-page
    Babel/React template cannot execute (CommonJS `require(...)` or ES6
    `import`/`export` statements).

    Only the first five lines are inspected, since module statements
    conventionally appear at the top of a snippet; a `require(...)` buried
    deeper (or not at line start) is not detected.
    """
    lines = code.split("\n")

    # Check for require statements or ES6 import/export statements at the
    # start of any of the first few lines
    for line in lines[:5]:
        if re.match(r"\s*require\(", line):
            return True
        if re.match(r"\s*import\s", line) or re.match(r"\s*export\s", line):
            return True

    return False


class React(BaseLanguage):
    """Runs React/JSX snippets by splicing them into an HTML template
    (React, ReactDOM and Babel are loaded by the template) and yielding
    that HTML to the user plus a rendered PNG to the assistant.
    """

    name = "React"
    file_extension = "html"
    system_message = "When you execute code with `react`, your react code will be run in a script tag after being inserted into the HTML template, following the installation of React, ReactDOM, and Babel for JSX parsing. **We will handle this! Don't make an HTML file to run React, just execute `react`.**"

    def run(self, code):
        # Reject module syntax (require/import/export) the template can't run.
        if is_incompatible(code):
            yield {
                "type": "console",
                "format": "output",
                "content": f"Error: React format not supported. {self.system_message} Therefore some things like `require` and 'import' aren't supported.",
                "recipient": "assistant",
            }
            return

        # Splice the user's code into the page template.
        code = template.replace("{insert_react_code}", code)

        yield {
            "type": "console",
            "format": "output",
            "content": "React is being displayed on the user's machine...",
            "recipient": "assistant",
        }

        # User sees interactive HTML
        yield {"type": "code", "format": "html", "content": code, "recipient": "user"}

        # Assistant sees image
        base64 = html_to_png_base64(code)
        yield {
            "type": "image",
            "format": "base64.png",
            "content": base64,
            "recipient": "assistant",
        }
1 change: 1 addition & 0 deletions interpreter/core/computer/terminal/languages/shell.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
class Shell(SubprocessLanguage):
file_extension = "sh"
name = "Shell"
aliases = ["bash", "sh", "zsh"]

def __init__(
self,
Expand Down
35 changes: 20 additions & 15 deletions interpreter/core/computer/terminal/terminal.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,30 +4,35 @@
from .languages.powershell import PowerShell
from .languages.python import Python
from .languages.r import R
from .languages.react import React
from .languages.shell import Shell

# Maps a user-facing language name (lowercase) to the class that executes it.
# Several names alias the same runner (bash/shell/sh/zsh all map to Shell).
language_map = {
    "python": Python,
    "bash": Shell,
    "shell": Shell,
    "sh": Shell,
    "zsh": Shell,
    "javascript": JavaScript,
    "html": HTML,
    "applescript": AppleScript,
    "r": R,
    "powershell": PowerShell,
}


class Terminal:
def __init__(self):
    """Set up the registry of runnable language classes and the map of
    already-started language instances.
    """
    # Language classes this terminal can instantiate on demand.
    self.languages = [
        Python,
        Shell,
        JavaScript,
        HTML,
        AppleScript,
        R,
        PowerShell,
        React,
    ]
    # Maps a language key to its live (already constructed) instance.
    self._active_languages = {}

def get_language(self, language):
    """Return the language class whose name (case-insensitive) or alias
    list matches `language`, or None if nothing matches.
    """
    target = language.lower()
    for candidate in self.languages:
        if target == candidate.name.lower():
            return candidate
        # Aliases are matched as given (not lowercased), same as before.
        if hasattr(candidate, "aliases") and language in candidate.aliases:
            return candidate
    return None

def run(self, language, code):
if language not in self._active_languages:
self._active_languages[language] = language_map[language]()
self._active_languages[language] = self.get_language(language)()
try:
yield from self._active_languages[language].run(code)
except GeneratorExit:
Expand Down
15 changes: 9 additions & 6 deletions interpreter/core/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,13 +124,16 @@ def _streaming_chat(self, message=None, display=True):
elif isinstance(message, list):
self.messages = message

# DISABLED because I think we should just not transmit images to non-multimodal models?
# REENABLE this when multimodal becomes more common:

# Make sure we're using a model that can handle this
if not self.vision:
for message in self.messages:
if message["type"] == "image":
raise Exception(
"Use a multimodal model and set `interpreter.vision` to True to handle image messages."
)
# if not self.vision:
# for message in self.messages:
# if message["type"] == "image":
# raise Exception(
# "Use a multimodal model and set `interpreter.vision` to True to handle image messages."
# )

# This is where it all happens!
yield from self._respond_and_store()
Expand Down
7 changes: 5 additions & 2 deletions interpreter/core/generate_system_message.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,12 @@ def generate_system_message(interpreter):
try:
system_message += "\n" + get_relevant_procedures_string(interpreter)
except:
raise
if interpreter.debug_mode:
print(traceback.format_exc())
# It's okay if they can't. This just fixes some common mistakes it makes.

return system_message
for language in interpreter.computer.terminal.languages:
if hasattr(language, "system_message"):
system_message += "\n\n" + language.system_message

return system_message.strip()
4 changes: 3 additions & 1 deletion interpreter/core/llm/setup_text_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,10 @@ def base_llm(messages):
except TypeError as e:
if interpreter.vision and str(e) == "expected string or buffer":
# There's just no way to use tokentrim on vision-enabled models yet.
# We instead handle this outside setup_text_llm!

if interpreter.debug_mode:
print("Couldn't token trim image messages. Error:", e)
print("Won't token trim image messages. ", e)

### DISABLED image trimming
# To maintain the order of messages while simulating trimming, we will iterate through the messages
Expand Down
25 changes: 23 additions & 2 deletions interpreter/core/respond.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,26 @@ def respond(interpreter):
messages_for_llm = interpreter.messages.copy()
messages_for_llm = [system_message] + messages_for_llm

# Trim image messages if they're there
if interpreter.vision:
image_messages = [msg for msg in messages_for_llm if msg["type"] == "image"]

if interpreter.os:
# Keep only the last image if the interpreter is running in OS mode
if len(image_messages) > 1:
for img_msg in image_messages[:-1]:
messages_for_llm.remove(img_msg)
if interpreter.debug_mode:
print("Removing image message!")
else:
# Delete all the middle ones (leave only the first and last 2 images) from messages_for_llm
if len(image_messages) > 3:
for img_msg in image_messages[1:-2]:
messages_for_llm.remove(img_msg)
if interpreter.debug_mode:
print("Removing image message!")
# Idea: we could set detail: low for the middle messages, instead of deleting them

### RUN THE LLM ###

try:
Expand Down Expand Up @@ -74,7 +94,8 @@ def respond(interpreter):
)
elif interpreter.local:
raise Exception(
str(e)
"Error occurred. "
+ str(e)
+ """
Please make sure LM Studio's local server is running by following the steps above, if you're using LM Studio (recommended).
Expand Down Expand Up @@ -133,7 +154,7 @@ def respond(interpreter):
break

# don't let it import computer on os mode — we handle that!
if interpreter.os:
if interpreter.os and language == "python":
code = code.replace("import computer", "")

# yield each line
Expand Down
30 changes: 3 additions & 27 deletions interpreter/core/utils/scan_code.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import os
import subprocess

from ..computer.terminal.terminal import language_map
from .temporary_file import cleanup_temporary_file, create_temporary_file

try:
Expand All @@ -11,37 +10,14 @@
pass


def get_language_file_extension(language_name):
    """Get the file extension for a given language.

    Falls back to returning the language class itself when its
    `file_extension` attribute is falsy (preserves historical behavior).
    """
    language = language_map[language_name.lower()]
    return language.file_extension if language.file_extension else language


def get_language_name(language_name):
    """Get the proper name for a given language.

    Falls back to returning the language class itself when its `name`
    attribute is falsy (preserves historical behavior).
    """
    language = language_map[language_name.lower()]
    return language.name if language.name else language


def scan_code(code, language, interpreter):
"""
Scan code with semgrep
"""
language_class = interpreter.computer.terminal.get_language(language)

temp_file = create_temporary_file(
code, get_language_file_extension(language), verbose=interpreter.debug_mode
code, language_class.file_extension, verbose=interpreter.debug_mode
)

temp_path = os.path.dirname(temp_file)
Expand All @@ -65,7 +41,7 @@ def scan_code(code, language, interpreter):
)

if scan.returncode == 0:
language_name = get_language_name(language)
language_name = language_class.name
print(
f" {'Code Scanner: ' if interpreter.safe_mode == 'auto' else ''}No issues were found in this {language_name} code."
)
Expand Down
Loading

0 comments on commit 1996b97

Please sign in to comment.