diff --git a/core/database.py b/core/database.py index 75aba77..5c31c05 100644 --- a/core/database.py +++ b/core/database.py @@ -21,7 +21,6 @@ import asyncio import datetime import logging -import re from typing import TYPE_CHECKING, Any, Self import aiohttp @@ -31,16 +30,18 @@ from . import utils from .models import FileModel, PasteModel +from .scanners import SecurityInfo, Services if TYPE_CHECKING: _Pool = asyncpg.Pool[asyncpg.Record] from types_.config import Github from types_.github import PostGist + from types_.scanner import ScannerSecret else: _Pool = asyncpg.Pool -DISCORD_TOKEN_REGEX: re.Pattern[str] = re.compile(r"[a-zA-Z0-9_-]{23,28}\.[a-zA-Z0-9_-]{6,7}\.[a-zA-Z0-9_-]{27,}") + LOGGER: logging.Logger = logging.getLogger(__name__) @@ -53,7 +54,7 @@ def __init__(self, *, dsn: str, session: aiohttp.ClientSession | None = None, gi self._handling_tokens = bool(self.session and github_config) if self._handling_tokens: - LOGGER.info("Will handle compromised discord info.") + LOGGER.info("Setup to handle Discord Tokens.") assert github_config # guarded by if here self._gist_token = github_config["token"] @@ -83,20 +84,15 @@ async def _token_task(self) -> None: await asyncio.sleep(self._gist_timeout) - def _handle_discord_tokens(self, *bodies: dict[str, str], paste_id: str) -> None: - formatted_bodies = "\n".join(b["content"] for b in bodies) - - tokens = list(DISCORD_TOKEN_REGEX.finditer(formatted_bodies)) - - if not tokens: + def _handle_discord_tokens(self, tokens: list[str], paste_id: str) -> None: + if not self._handling_tokens or not tokens: return LOGGER.info( "Discord bot token located and added to token bucket. Current bucket size is: %s", len(self.__tokens_bucket) ) - tokens = "\n".join([m[0] for m in tokens]) - self.__tokens_bucket[paste_id] = tokens + self.__tokens_bucket[paste_id] = "\n".join(tokens) async def _post_gist_of_tokens(self) -> None: assert self.session # guarded in caller @@ -211,8 +207,8 @@ async def create_paste(self, *, data: dict[str, Any]) -> PasteModel: """ file_query: str = """ - INSERT INTO files (parent_id, content, filename, loc, annotation) - VALUES ($1, $2, $3, $4, $5) + INSERT INTO files (parent_id, content, filename, loc, annotation, warning_positions) + VALUES ($1, $2, $3, $4, $5, $6) RETURNING * """ @@ -246,28 +242,39 @@ async def create_paste(self, *, data: dict[str, Any]) -> PasteModel: name: str = (file.get("filename") or f"file_{index}")[-CONFIG["PASTES"]["name_limit"] :] name = "_".join(name.splitlines()) - content: str = file["content"] + # Normalise newlines... + content: str = file["content"].replace("\r\n", "\n").replace("\r", "\n") loc: int = file["content"].count("\n") + 1 - annotation: str = "" - tokens = [t for t in utils.TOKEN_REGEX.findall(content) if utils.validate_discord_token(t)] - if tokens: - annotation = "Contains possibly sensitive information: Discord Token(s)" - if not password: - annotation += ", which have now been invalidated." + positions: list[int] = [] + extra: str = "" + + secrets: list[ScannerSecret] = SecurityInfo.scan_file(content) + for payload in secrets: + service: Services = payload["service"] + + extra += f"{service.value}, " + positions += [t[0] for t in payload["tokens"]] + + if not password and self._handling_tokens and service is Services.discord: + self._handle_discord_tokens(tokens=[t[1] for t in payload["tokens"]], paste_id=paste.id) + + extra = extra.removesuffix(", ") + annotation = f"Contains possibly sensitive data from: {extra}" if extra else "" row: asyncpg.Record | None = await connection.fetchrow( - file_query, paste.id, content, name, loc, annotation + file_query, + paste.id, + content, + name, + loc, + annotation, + sorted(positions), ) if row: paste.files.append(FileModel(row)) - if not password: - # if the user didn't provide a password (a public paste) - # we check for discord tokens - self._handle_discord_tokens(*data["files"], paste_id=paste.id) - return paste async def fetch_paste_security(self, *, token: str) -> PasteModel | None: diff --git a/core/models.py b/core/models.py index ec21ae3..e8dab76 100644 --- a/core/models.py +++ b/core/models.py @@ -67,6 +67,7 @@ def __init__(self, record: asyncpg.Record | dict[str, Any]) -> None: self.charcount: int = record["charcount"] self.index: int = record["file_index"] self.annotation: str = record["annotation"] + self.warning_positions: list[int] = record["warning_positions"] class PasteModel(BaseModel): diff --git a/core/scanners.py b/core/scanners.py new file mode 100644 index 0000000..5d31dc8 --- /dev/null +++ b/core/scanners.py @@ -0,0 +1,134 @@ +"""MystBin. Share code easily. + +Copyright (C) 2020-Current PythonistaGuild + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +""" + +from __future__ import annotations + +import base64 +import binascii +import enum +import logging +import re +from typing import TYPE_CHECKING, ClassVar + + +if TYPE_CHECKING: + from types_.scanner import ScannerSecret + + +logger: logging.Logger = logging.getLogger(__name__) + + +class Services(enum.Enum): + discord = "Discord" + pypi = "PyPi" + github = "GitHub" + + +class BaseScanner: + REGEX: ClassVar[re.Pattern[str]] + SERVICE: ClassVar[Services] + + @classmethod + def match(cls, content: str) -> ScannerSecret: + matches: list[tuple[int, str]] = [(m.start(0), m.group(0)) for m in cls.REGEX.finditer(content)] + + payload: ScannerSecret = { + "service": cls.SERVICE, + "tokens": matches, + } + + return payload + + +class DiscordScanner(BaseScanner): + REGEX = re.compile(r"[a-zA-Z0-9_-]{23,28}\.[a-zA-Z0-9_-]{6,7}\.[a-zA-Z0-9_-]{27,}") + SERVICE = Services.discord + + @staticmethod + def validate_discord_token(token: str) -> bool: + try: + # Just check if the first part validates as a user ID + (user_id, _, _) = token.split(".") + user_id = int(base64.b64decode(user_id + "==", validate=True)) + except (ValueError, binascii.Error): + return False + else: + return True + + @classmethod + def match(cls, content: str) -> ScannerSecret: + matches: list[tuple[int, str]] = [ + (m.start(0), m.group(0)) for m in cls.REGEX.finditer(content) if cls.validate_discord_token(m.group(0)) + ] + + payload: ScannerSecret = { + "service": cls.SERVICE, + "tokens": matches, + } + + return payload + + +class PyPiScanner(BaseScanner): + REGEX = re.compile(r"pypi-AgEIcHlwaS5vcmc[A-Za-z0-9-_]{70,}") + SERVICE = Services.pypi + + +class GitHubScanner(BaseScanner): + REGEX = re.compile(r"((ghp|gho|ghu|ghs|ghr)_[A-Za-z0-9_]{36})") + SERVICE = Services.github + + +class SecurityInfo: + __SERVICE_MAPPING: ClassVar[dict[Services, type[BaseScanner]]] = { + Services.discord: DiscordScanner, + Services.pypi: PyPiScanner, + Services.github: GitHubScanner, + } + + @classmethod + def scan_file( + cls, + file: str, + /, + *, + allowed: list[Services] | None = None, + disallowed: list[Services] | None = None, + ) -> list[ScannerSecret]: + """Scan for tokens in a given files content. + + You may pass a list of allowed or disallowed Services. + If both lists are empty (Default) all available services will be scanned. + """ + disallowed = disallowed or [] + allowed = allowed or list(Services) + + services: list[Services] = [s for s in allowed if s not in disallowed] + secrets: list[ScannerSecret] = [] + + for service in services: + scanner: type[BaseScanner] | None = cls.__SERVICE_MAPPING.get(service, None) + if not scanner: + logging.warning("The provided service %r is not a supported or a valid service.", service) + continue + + found: ScannerSecret = scanner.match(file) + if found["tokens"]: + secrets.append(found) + + return secrets diff --git a/migration.sql b/migration.sql index dfc4bd6..899c850 100644 --- a/migration.sql +++ b/migration.sql @@ -15,6 +15,7 @@ ALTER TABLE files ALTER COLUMN filename SET NOT NULL; -- always require filenam ALTER TABLE files DROP COLUMN IF EXISTS attachment; -- we don't have these anymore ALTER TABLE files ADD COLUMN IF NOT EXISTS annotation TEXT; ALTER TABLE files RENAME COLUMN index TO file_index; -- bad column name +ALTER TABLE files ADD COLUMN IF NOT EXISTS warning_positions INTEGER[]; -- New line warning positions SAVEPOINT drops; DROP TABLE IF EXISTS bans CASCADE; -- no longer needed diff --git a/schema.sql b/schema.sql index 98bb93d..3ca0776 100644 --- a/schema.sql +++ b/schema.sql @@ -20,5 +20,6 @@ CREATE TABLE IF NOT EXISTS files ( charcount INTEGER GENERATED ALWAYS AS (LENGTH(content)) STORED, file_index SERIAL NOT NULL, annotation TEXT, + warning_positions INTEGER[], PRIMARY KEY (parent_id, file_index) ); diff --git a/types_/scanner.py b/types_/scanner.py new file mode 100644 index 0000000..3b63909 --- /dev/null +++ b/types_/scanner.py @@ -0,0 +1,30 @@ +"""MystBin. Share code easily. + +Copyright (C) 2020-Current PythonistaGuild + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, TypedDict + + +if TYPE_CHECKING: + from core.scanners import Services + + +class ScannerSecret(TypedDict): + service: Services + tokens: list[tuple[int, str]] diff --git a/views/htmx.py b/views/htmx.py index e752ce9..773c761 100644 --- a/views/htmx.py +++ b/views/htmx.py @@ -55,12 +55,46 @@ def highlight_code(self, *, files: list[dict[str, Any]]) -> str: raw_url: str = f'/raw/{file["parent_id"]}' annotation: str = file["annotation"] + positions: list[int] = file.get("warning_positions", []) + original: str = file["content"] - content = bleach.clean( - file["content"].replace("{parts[0]}""" + if parts + else "" + ) + annotations: str = ( + f'❌ {annotation}{": " + extra if extra else ""}' + if annotation + else "" ) - annotations: str = f'❌ {annotation}' if annotation else "" + position: int = 0 + next_pos: int | None = positions.pop(0) if positions else None + + numbers: list[str] = [] + for n, line in enumerate(original.splitlines(), 1): + length: int = len(line) + + if next_pos is not None and position <= next_pos <= position + length: + numbers.append(f"""{n}""") + + try: + next_pos = positions.pop(0) + except IndexError: + next_pos = None + + else: + numbers.append(f"""{n}""") + + position += length + 1 + + content = bleach.clean(original.replace("\n{"".join(numbers)}\n""" html += f"""
@@ -72,7 +106,7 @@ def highlight_code(self, *, files: list[dict[str, Any]]) -> str:
{annotations} -
{content}
+
{lines}{content}
""" return html diff --git a/web/index.html b/web/index.html index 4fa4673..25f2cde 100644 --- a/web/index.html +++ b/web/index.html @@ -23,7 +23,7 @@ - + diff --git a/web/maint.html b/web/maint.html index 61261ea..945c2a6 100644 --- a/web/maint.html +++ b/web/maint.html @@ -15,7 +15,7 @@ - + diff --git a/web/password.html b/web/password.html index 5e217b3..2bcde32 100644 --- a/web/password.html +++ b/web/password.html @@ -14,18 +14,17 @@ - - + - + @@ -56,7 +55,7 @@
diff --git a/web/paste.html b/web/paste.html index 9f43ed0..29a210a 100644 --- a/web/paste.html +++ b/web/paste.html @@ -13,18 +13,17 @@ - - + - + diff --git a/web/static/packages/highlight-ln.min.js b/web/static/packages/highlight-ln.min.js deleted file mode 100644 index a5f9f20..0000000 --- a/web/static/packages/highlight-ln.min.js +++ /dev/null @@ -1 +0,0 @@ -!function(r,o){"use strict";var e,i="hljs-ln",l="hljs-ln-line",h="hljs-ln-code",s="hljs-ln-numbers",c="hljs-ln-n",m="data-line-number",a=/\r\n|\r|\n/g;function u(e){for(var n=e.toString(),t=e.anchorNode;"TD"!==t.nodeName;)t=t.parentNode;for(var r=e.focusNode;"TD"!==r.nodeName;)r=r.parentNode;var o=parseInt(t.dataset.lineNumber),a=parseInt(r.dataset.lineNumber);if(o==a)return n;var i,l=t.textContent,s=r.textContent;for(a
{6}',[l,s,c,m,h,o+n.startFrom,0{1}',[i,r])}return e}(e.innerHTML,o)}function v(e){var n=e.className;if(/hljs-/.test(n)){for(var t=g(e.innerHTML),r=0,o="";r{1}\n',[n,0