From 1bbdbfed20d28daca4496371feb1fd0becb4171a Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Wed, 5 Feb 2025 17:44:41 +0100 Subject: [PATCH] Improve bot regexes (#7986) --- Tests/fixtures/bots.yml | 6 +- regexes/bots.yml | 341 ++++++++++++++++++++-------------------- 2 files changed, 171 insertions(+), 176 deletions(-) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 8e821f2eef..844476f2f5 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -4497,7 +4497,7 @@ - user_agent: Mozilla/5.0 (compatible; seolyt/1.1; +https://seolyt.com) bot: - name: seolyt + name: SeolytBot category: Crawler url: https://seolyt.com/ - @@ -5169,9 +5169,9 @@ - user_agent: 'Mozilla/5.0 (compatible; SeolytBot/1.0.1; https://seolyt.com)' bot: - name: Seolyt Bot + name: SeolytBot category: Crawler - url: https://seolyt.com + url: https://seolyt.com/ - user_agent: LinkWalker/3.0 (http://www.brandprotect.com) bot: diff --git a/regexes/bots.yml b/regexes/bots.yml index c487d3ab7d..2ca61d3259 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -5,7 +5,7 @@ # @license http://www.gnu.org/licenses/lgpl.html LGPL v3 or later ############### -- regex: 'WireReaderBot(?:/([\d+.]+))?' +- regex: 'WireReaderBot' name: 'WireReaderBot' category: 'Feed Fetcher' url: 'https://wirereader.app/' @@ -66,7 +66,7 @@ name: 'Ahrefs Pte Ltd' url: 'https://ahrefs.com/robot' -- regex: 'AhrefsSiteAudit/[\d.]+' +- regex: 'AhrefsSiteAudit' name: 'AhrefsSiteAudit' category: 'Site Monitor' url: 'https://ahrefs.com/robot/site-audit' @@ -90,7 +90,7 @@ name: 'Alexa Internet' url: 'https://www.alexa.com' -- regex: 'Amazonbot/[\d.]+' +- regex: 'Amazonbot' name: 'Amazon Bot' category: 'Crawler' url: 'https://developer.amazon.com/support/amazonbot' @@ -98,7 +98,7 @@ name: 'Amazon.com, Inc.' url: 'https://www.amazon.com/' -- regex: 'AmazonAdBot/[\d.]+' +- regex: 'AmazonAdBot' name: 'Amazon AdBot' category: 'Crawler' url: 'https://adbot.amazon.com/' @@ -615,7 +615,7 @@ name: 'Meta Platforms, Inc.' url: 'https://www.meta.com/' -- regex: 'FacebookBot/[\d.]+' +- regex: 'FacebookBot' name: 'FacebookBot' category: 'Crawler' url: 'https://developers.facebook.com/docs/sharing/bot' @@ -663,7 +663,7 @@ name: '' url: '' -- regex: 'Fever/[0-9]' +- regex: 'Fever/' name: 'Fever' url: 'http://feedafever.com/' category: 'Feed Fetcher' @@ -937,7 +937,7 @@ category: 'Search bot' url: 'https://vuhuv.com/bot.html' -- regex: 'HTTPMon/[\d.]+' +- regex: 'HTTPMon' name: 'HTTPMon' category: 'Site Monitor' url: 'http://www.httpmon.com' @@ -981,7 +981,7 @@ name: '' url: 'https://ip-guide.com' -- regex: 'k6/[0-9.]+' +- regex: 'k6/' name: 'K6' url: 'https://k6.io/' @@ -1065,7 +1065,7 @@ name: '' url: '' -- regex: 'masscan-ng/[\d.]+' +- regex: 'masscan-ng' name: 'masscan-ng' url: 'https://github.com/bi-zone/masscan-ng' category: 'Crawler' @@ -1236,7 +1236,7 @@ name: 'NodePing' url: 'https://nodeping.com' -- regex: 'Octopus [0-9]' +- regex: 'Octopus [\d.]+' name: 'Octopus' - regex: 'OnlineOrNot\.com_bot' @@ -1308,7 +1308,7 @@ name: 'PHP Server Monitor' url: 'http://www.phpservermonitor.org/' -- regex: 'Pocket(?:ImageCache|Parser)/[\d.]+' +- regex: 'Pocket(?:ImageCache|Parser)' name: 'Pocket' category: 'Read-it-later Service' url: 'https://getpocket.com/pocketparser_ua' @@ -1470,7 +1470,7 @@ name: 'Semrush Inc.' url: 'https://www.semrush.com/' -- regex: 'SerpReputationManagementAgent/[\d.]+' +- regex: 'SerpReputationManagementAgent' name: 'Semrush Reputation Management' category: 'Service Agent' url: 'https://www.semrush.com/bot/' @@ -1486,7 +1486,7 @@ name: 'Semrush Inc.' url: 'https://www.semrush.com/' -- regex: 'SiteAuditBot/[\d.]+' +- regex: 'SiteAuditBot' name: 'SiteAuditBot' category: 'Crawler' url: 'https://www.semrush.com/bot/' @@ -1648,7 +1648,7 @@ name: 'Superfeedr' url: 'https://superfeedr.com/' -- regex: 'Sparkler/[0-9]' +- regex: 'Sparkler' name: 'Sparkler' category: 'Crawler' url: 'https://github.com/USCDataScience/sparkler' @@ -1793,7 +1793,7 @@ name: 'UkrNet Ltd' url: 'https://www.ukr.net/' -- regex: 'Uptime(?:bot)?/[\d.]+' +- regex: 'Uptime(?:bot)?/' name: 'Uptimebot' category: 'Site Monitor' url: 'https://uptime.com/uptime-bot' @@ -1949,7 +1949,7 @@ name: 'WPBeginner, LLC' url: 'https://www.wpbeginner.com/' -- regex: 'Automattic Analytics Crawler/[\d.]+' +- regex: 'Automattic Analytics Crawler' name: 'Automattic Analytics' category: 'Crawler' url: 'https://wordpress.com/crawler/' @@ -2093,7 +2093,7 @@ name: 'NetEase, Inc.' url: 'http://corp.163.com' -- regex: 'YOURLS v[0-9]' +- regex: 'YOURLS' name: 'Yourls' category: 'Crawler' url: 'http://yourls.org' @@ -2159,7 +2159,7 @@ name: 'HubPages, Inc.' url: 'https://discover.hubpages.com/' -- regex: 'Pinterest(?:bot)?/[\d.]+.*www\.pinterest\.com' +- regex: 'Pinterest(?:bot)?/.*www\.pinterest\.com' name: 'Pinterest' url: 'https://help.pinterest.com/en/business/article/pinterest-crawler' category: 'Crawler' @@ -2175,7 +2175,7 @@ name: 'Site24x7' url: 'https://www.site24x7.com' -- regex: '.* HLB/[\d.]+' +- regex: '.* HLB' name: 'Site24x7 Defacement Monitor' category: 'Site Monitor' url: 'https://support.site24x7.com/portal/en/kb/articles/default-user-agent-used-in-website-defacement-monitor' @@ -2199,7 +2199,7 @@ name: 'Snapchat Inc.' url: 'https://www.snapchat.com/' -- regex: 'SnapchatAds/[\d.]+' +- regex: 'SnapchatAds' name: 'Snapchat Ads' category: 'Crawler' url: 'https://businesshelp.snapchat.com/s/article/adsbot-crawler?language=en_US' @@ -2596,7 +2596,7 @@ name: 'PPC Labs LLC' url: 'https://www.adbeat.com/' -- regex: '(?:BuiltWith|BW)/[\d.]+' +- regex: '(?:BuiltWith|BW/)' name: 'BuiltWith' category: 'Crawler' url: 'https://builtwith.com/biup' @@ -2651,7 +2651,7 @@ category: 'Site Monitor' url: 'http://cloudsystemnetworks.com' -- regex: 'HeartRails_Capture/[\d.]+' +- regex: 'HeartRails_Capture' name: 'Heart Rails Capture' category: 'Service Agent' url: 'http://capture.heartrails.com' @@ -2664,7 +2664,7 @@ name: 'RedHunt Labs Limited' url: 'https://redhuntlabs.com/' -- regex: 'DataXu/[\d.]+' +- regex: 'DataXu' name: 'DataXu' category: 'Service Agent' url: 'https://advertising.roku.com/dataxu' @@ -2788,12 +2788,12 @@ name: 'Hatena Co., Ltd.' url: 'https://www.hatena.ne.jp' -- regex: 'RyowlEngine/[\d.]+' +- regex: 'RyowlEngine' name: 'Ryowl' category: 'Crawler' url: 'https://ryowl.org' -- regex: 'OdklBot/[\d.]+' +- regex: 'OdklBot' name: 'Odnoklassniki Bot' category: 'Crawler' url: 'https://odnoklassniki.ru' @@ -2808,7 +2808,7 @@ category: 'Crawler' url: 'https://www.zoominfo.com' -- regex: 'WeViKaBot/[\d.]+' +- regex: 'WeViKaBot' name: 'WeViKaBot' category: 'Crawler' url: 'http://www.wevika.de' @@ -2821,7 +2821,7 @@ name: 'SEOkicks' url: 'https://www.seokicks.de/' -- regex: 'Plukkie/[\d.]+' +- regex: 'Plukkie' name: 'Plukkie' category: 'Crawler' url: 'http://www.botje.com/plukkie.htm' @@ -2831,22 +2831,22 @@ category: 'Crawler' url: 'https://www.comscore.com/Web-Crawler' -- regex: 'SurdotlyBot/[\d.]+' +- regex: 'SurdotlyBot' name: 'SurdotlyBot' category: 'Crawler' url: 'http://sur.ly/bot.html' -- regex: 'Gowikibot/[\d.]+' +- regex: 'Gowikibot' name: 'Gowikibot' category: 'Crawler' url: 'http:/www.gowikibot.com' -- regex: 'SabsimBot/[\d.]+' +- regex: 'SabsimBot' name: 'SabsimBot' category: 'Crawler' url: 'https://sabsim.com' -- regex: 'LumtelBot/[\d.]+' +- regex: 'LumtelBot' name: 'LumtelBot' category: 'Crawler' url: 'https://umtel.com' @@ -2856,12 +2856,12 @@ category: 'Crawler' url: 'http://www.pipl.com/bot' -- regex: 'woobot/[\d.]+' +- regex: 'woobot' name: 'WooRank' category: 'Crawler' url: 'https://www.woorank.com/bot' -- regex: 'Cookiebot/[\d.]+' +- regex: 'Cookiebot' name: 'Cookiebot' category: 'Crawler' url: 'https://support.cookiebot.com/hc/en-us/articles/360014264140-Scanner-User-Agent' @@ -2877,7 +2877,7 @@ name: 'NET SYSTEMS RESEARCH LLC' url: 'https://www.netsystemsresearch.com/' -- regex: 'CensysInspect/[\d.]+' +- regex: 'CensysInspect' name: 'CensysInspect' category: 'Security Checker' url: 'https://about.censys.io/' @@ -2893,17 +2893,17 @@ name: 'Global Digital Network Plus, LLC' url: 'https://gdnplus.com/' -- regex: 'WellKnownBot/[\d.]+' +- regex: 'WellKnownBot' name: 'WellKnownBot' category: 'Crawler' url: 'https://well-known.dev' -- regex: 'Adsbot/[\d.]+' +- regex: 'Adsbot' name: 'Adsbot' category: 'Crawler' url: 'https://seostar.co/robot/' -- regex: 'MTRobot/[\d.]+' +- regex: 'MTRobot' name: 'MTRobot' category: 'Crawler' url: 'https://metrics-tools.de/robot.html' @@ -2911,7 +2911,7 @@ name: 'Metrics Tools' url: 'https://metrics-tools.de/' -- regex: 'serpstatbot/[\d.]+' +- regex: 'serpstatbot' name: 'serpstatbot' category: 'Crawler' url: 'http://serpstatbot.com/' @@ -2924,17 +2924,17 @@ category: 'Crawler' url: 'https://github.com/gocolly/colly/' -- regex: 'l9tcpid/v[\d.]+' +- regex: 'l9tcpid' name: 'l9tcpid' category: 'Security Checker' url: 'https://github.com/LeakIX/l9tcpid' -- regex: 'l9explore/[\d.]+' +- regex: 'l9explore' name: 'l9explore' category: 'Security Checker' url: 'https://github.com/LeakIX/l9explore' -- regex: 'l9scan/|^Lkx-.*/[\d.]+' +- regex: 'l9scan/|^Lkx-.*/' name: 'LeakIX' category: 'Security Checker' url: 'https://leakix.net/' @@ -2942,7 +2942,7 @@ name: 'BaDaaS SRL' url: 'https://leakix.net/' -- regex: 'MegaIndex\.ru/[\d.]+' +- regex: 'MegaIndex\.ru' name: 'MegaIndex' category: 'Crawler' url: 'https://megaindex.com/crawler' @@ -2955,12 +2955,12 @@ name: 'SISTRIX GmbH' url: 'https://www.sistrix.de/' -- regex: 'seolyt/[\d.]+' - name: 'seolyt' +- regex: 'Seolyt(?:Bot)?' + name: 'SeolytBot' category: 'Crawler' url: 'https://seolyt.com/' -- regex: 'YaK/[\d.]+' +- regex: 'YaK/' name: 'YaK' category: 'Crawler' url: 'https://www.linkfluence.com/' @@ -2968,7 +2968,7 @@ name: 'Linkfluence SAS' url: 'https://www.linkfluence.com/' -- regex: 'KomodiaBot/[\d.]+' +- regex: 'KomodiaBot' name: 'KomodiaBot' category: 'Crawler' url: 'http://www.komodia.com/newwiki/index.php/URL_server_crawler' @@ -2976,7 +2976,7 @@ name: 'Komodia Inc.' url: 'https://www.komodia.com/' -- regex: 'KStandBot/[\d.]+' +- regex: 'KStandBot' name: 'KStandBot' category: 'Crawler' url: 'https://url-classification.io/wiki/index.php?title=URL_server_crawler' @@ -2984,7 +2984,7 @@ name: 'Komodia Inc.' url: 'https://www.komodia.com/' -- regex: 'Neevabot/[\d.]+' +- regex: 'Neevabot' name: 'Neevabot' category: 'Search bot' url: 'https://neeva.com/neevabot' @@ -2992,17 +2992,25 @@ name: 'Neeva Inc.' url: 'https://neeva.com/' -- regex: 'LinkPreview/[\d.]+' +- regex: 'Chatwork LinkPreview' + name: 'Chatwork LinkPreview' + category: 'Service Agent' + url: 'https://go.chatwork.com/en/' + producer: + name: 'kubell Co., Ltd.' + url: 'https://www.kubell.com/en/' + +- regex: 'LinkPreview' name: 'LinkPreview' category: 'Service Agent' url: 'https://www.linkpreview.net/' -- regex: 'JungleKeyThumbnail/[\d.]+' +- regex: 'JungleKeyThumbnail' name: 'JungleKeyThumbnail' category: 'Crawler' url: 'https://junglekey.com/' -- regex: 'rocketmonitor(?: |bot/)[\d.]+' +- regex: 'rocketmonitor(?:bot)?' name: 'RocketMonitorBot' category: 'Site Monitor' url: 'https://www.radiomast.io/docs/stream-monitoring/technical_details.html' @@ -3010,7 +3018,7 @@ name: 'Radio Mast, Inc.' url: 'https://www.radiomast.io/' -- regex: 'SitemapParser-VIPnytt/[\d.]+' +- regex: 'SitemapParser-VIPnytt' name: 'SitemapParser-VIPnytt' category: 'Crawler' url: 'https://github.com/VIPnytt/SitemapParser/' @@ -3020,7 +3028,7 @@ category: 'Crawler' url: 'https://turnitin.com/robot/crawlerinfo.html' -- regex: 'DMBrowser/[\d.]+|DMBrowser-[UB]V' +- regex: 'DMBrowser|DMBrowser-[UB]V' name: 'Dotcom Monitor' category: 'Site Monitor' url: 'https://www.dotcom-monitor.com' @@ -3034,17 +3042,17 @@ category: 'Crawler' url: 'https://dataforseo.com/dataforseo-bot' -- regex: 'Discordbot/[\d.]+' +- regex: 'Discordbot' name: 'Discord Bot' category: 'Service Agent' url: 'https://discordapp.com' -- regex: 'Linespider/[\d.]+' +- regex: 'Linespider' name: 'Linespider' category: 'Crawler' url: 'https://lin.ee/4dwXkTH' -- regex: 'Cincraw/[\d.]+' +- regex: 'Cincraw' name: 'Cincraw' category: 'Crawler' url: 'http://cincrawdata.net/bot/' @@ -3078,17 +3086,17 @@ name: 'Hochschule für angewandte Wissenschaften München' url: 'https://www.hm.edu/' -- regex: 'TigerBot/[\d.]+' +- regex: 'TigerBot' name: 'TigerBot' category: 'Crawler' url: 'https://tiger.ch/' -- regex: 'TestCrawler/[\d.]+' +- regex: 'TestCrawler' name: 'TestCrawler' category: 'Crawler' url: 'https://www.comcepta.com/' -- regex: 'CrowdTanglebot/[\d.]+' +- regex: 'CrowdTanglebot' name: 'CrowdTangle' category: 'Crawler' url: 'https://help.crowdtangle.com/en/articles/3009319-crowdtangle-bot' @@ -3120,7 +3128,7 @@ name: 'deepnoc, GmbH' url: 'https://deepnoc.com/' -- regex: 'Newslitbot/[\d.]+' +- regex: 'Newslitbot' name: 'Newslitbot' category: 'Crawler' url: 'https://www.newslit.co/' @@ -3128,7 +3136,7 @@ name: 'Newslit, LLC.' url: 'https://www.newslit.co/' -- regex: 'um-(?:ANS|CC|FC|IC|LN)/[\d.]+' +- regex: 'um-(?:ANS|CC|FC|IC|LN)' name: 'uMBot' category: 'Crawler' url: 'https://www.ubermetrics-technologies.com/' @@ -3136,7 +3144,7 @@ name: 'Ubermetrics Technologies GmbH' url: 'https://www.ubermetrics-technologies.com/' -- regex: 'Abonti/[\d.]+' +- regex: 'Abonti' name: 'Abonti' category: 'Crawler' url: 'http://abonti.com/' @@ -3157,7 +3165,7 @@ name: 'Beijing Tiantexin Tech. Co., Ltd.' url: 'https://en.ipip.net/' -- regex: 'ev-crawler/[\d.]+' +- regex: 'ev-crawler' name: 'Headline' category: 'Crawler' url: 'https://headline.com/legal/crawler' @@ -3165,7 +3173,7 @@ name: 'e.ventures Managementgesellschaft mbH' url: 'https://headline.com/' -- regex: 'webprosbot/[\d.]+' +- regex: 'webprosbot' name: 'WebPros' category: 'Crawler' url: 'https://webpros.com/' @@ -3181,7 +3189,7 @@ name: 'Amazon.com, Inc.' url: 'https://www.amazon.com/' -- regex: 'Wheregoes\.com Redirect Checker/[\d.]+' +- regex: 'Wheregoes\.com Redirect Checker' name: 'WhereGoes' category: 'Crawler' url: 'https://wheregoes.com/' @@ -3191,12 +3199,12 @@ category: 'Crawler' url: 'http://66.240.192.82/' -- regex: 'InternetMeasurement/[\d.]+' +- regex: 'InternetMeasurement' name: 'InternetMeasurement' category: 'Crawler' url: 'https://internet-measurement.com/' -- regex: 'DomainAppender /[\d.]+' +- regex: 'DomainAppender' name: 'DomainAppender' category: 'Crawler' url: 'https://www.profound.net/product/domain_append/' @@ -3204,7 +3212,7 @@ name: 'Profound Networks, LLC' url: 'https://www.profound.net/' -- regex: 'FreeWebMonitoring SiteChecker/[\d.]+' +- regex: 'FreeWebMonitoring SiteChecker' name: 'FreeWebMonitoring' category: 'Site Monitor' url: 'https://www.freewebmonitoring.com/bot.html' @@ -3228,7 +3236,7 @@ name: 'Jaohawi AB' url: 'https://adstxtlab.com/' -- regex: 'Iframely/[\d.]+' +- regex: 'Iframely' name: 'Iframely' category: 'Crawler' url: 'https://iframely.com/' @@ -3236,7 +3244,7 @@ name: 'Itteco Software, Corp.' url: 'https://iframely.com/' -- regex: 'DomainStatsBot/[\d.]+' +- regex: 'DomainStatsBot' name: 'DomainStatsBot' category: 'Crawler' url: 'https://domainstats.com/pages/our-bot' @@ -3244,7 +3252,7 @@ name: 'Domainstats Ltd' url: 'https://domainstats.com/' -- regex: 'aiHitBot/[\d.]+' +- regex: 'aiHitBot' name: 'aiHitBot' category: 'Crawler' url: 'https://www.aihitdata.com/about' @@ -3262,7 +3270,7 @@ name: 'GitCrawlerBot' category: 'Crawler' -- regex: 'AdAuth/[\d.]+' +- regex: 'AdAuth' name: 'AdAuth' category: 'Crawler' url: 'https://www.adauth.com' @@ -3330,7 +3338,7 @@ name: 'Swoppen Systems GmbH' url: 'https://www.swoppen.com/de' -- regex: 'ScamadviserExternalHit/[\d.]+' +- regex: 'ScamadviserExternalHit' name: 'Scamadviser External Hit' category: 'Crawler' url: 'https://www.scamadviser.com/' @@ -3346,17 +3354,12 @@ name: 'Zaldamo, LLC.' url: 'https://www.zaldamo.com/' -- regex: 'AFB/[\d.]+' +- regex: 'AFB' name: 'Allloadin Favicon Bot' category: 'Crawler' url: 'https://allloadin.com/' -- regex: 'SeolytBot/[\d.]+' - name: 'Seolyt Bot' - category: 'Crawler' - url: 'https://seolyt.com' - -- regex: 'LinkWalker/[\d.]+' +- regex: 'LinkWalker' name: 'LinkWalker' category: 'Crawler' url: 'https://www.phishlabs.com/' @@ -3364,7 +3367,7 @@ name: 'PhishLabs, Inc.' url: 'https://www.phishlabs.com/' -- regex: 'RenovateBot/[\d.]+' +- regex: 'RenovateBot' name: 'RenovateBot' category: 'Security Checker' url: 'https://github.com/renovatebot/renovate' @@ -3372,7 +3375,7 @@ name: 'White Source Ltd.' url: 'https://www.mend.io/free-developer-tools/renovate/' -- regex: 'INETDEX-BOT/[\d.]+' +- regex: 'INETDEX-BOT' name: 'Inetdex Bot' category: 'Crawler' url: 'https://www.inetdex.com/' @@ -3401,7 +3404,7 @@ name: 'PDR Labs' url: 'https://web.archive.org/web/20220420054123/http://www.pdrlabs.net/' -- regex: 'Nicecrawler/[\d.]+' +- regex: 'Nicecrawler' name: 'NiceCrawler' category: 'Crawler' url: 'https://www.nicecrawler.com/' @@ -3409,7 +3412,7 @@ name: 'Intelium Corp.' url: 'https://www.intelium.com/' -- regex: 't3versionsBot/[\d.]+' +- regex: 't3versionsBot' name: 't3versions' category: 'Crawler' url: 'https://www.t3versions.com/bot' @@ -3417,7 +3420,7 @@ name: 'Torben Hansen' url: 'https://www.t3versions.com/' -- regex: 'Crawlson/[\d.]+' +- regex: 'Crawlson' name: 'Crawlson' category: 'Crawler' url: 'https://www.crawlson.com/about' @@ -3425,7 +3428,7 @@ name: 'Crawlson' url: 'https://www.crawlson.com/' -- regex: 'tchelebi/[\d.]+' +- regex: 'tchelebi' name: 'tchelebi' category: 'Crawler' url: 'https://tchelebi.io/' @@ -3441,7 +3444,7 @@ name: 'New Work SE' url: 'https://www.xing.com/' -- regex: 'RepoLookoutBot/v?[\d.]+' +- regex: 'RepoLookoutBot' name: 'Repo Lookout' category: 'Security Checker' url: 'https://www.repo-lookout.org/' @@ -3457,7 +3460,7 @@ name: 'MAMI Project' url: 'https://mami-project.eu/' -- regex: 'everyfeed-spider/[\d.]+' +- regex: 'everyfeed-spider' name: 'Everyfeed' url: 'https://web.archive.org/web/20050930235914/http://www.everyfeed.com/' category: 'Feed Fetcher' @@ -3481,7 +3484,7 @@ name: '' url: '' -- regex: 'Gregarius/[\d.]+' +- regex: 'Gregarius' name: 'Gregarius' category: 'Feed Fetcher' url: 'https://web.archive.org/web/20100614011837/http://devlog.gregarius.net/docs/ua/' @@ -3505,7 +3508,7 @@ name: 'Sectigo Limited' url: 'https://sectigo.com/' -- regex: 'KlarnaBot-(?:DownloadProductImage|EnrichProducts|PriceWatcher)/[\d.]+' +- regex: 'KlarnaBot-(?:DownloadProductImage|EnrichProducts|PriceWatcher)' name: 'KlarnaBot' category: 'Crawler' url: 'https://docs.klarna.com/klarna-bot/' @@ -3513,7 +3516,7 @@ name: 'Klarna Bank AB' url: 'https://www.klarna.com/' -- regex: 'Taboolabot/[\d.]+' +- regex: 'Taboolabot' name: 'Taboolabot' category: 'Crawler' url: 'https://help.taboola.com/hc/en-us/articles/115002347594-The-Taboola-Crawler' @@ -3521,7 +3524,7 @@ name: 'Taboola, Inc.' url: 'https://www.taboola.com/' -- regex: 'Asana/[\d.]+' +- regex: 'Asana' name: 'Asana' category: 'Crawler' url: 'https://asana.com/' @@ -3537,7 +3540,7 @@ name: 'Google Inc.' url: 'https://www.google.com/' -- regex: 'URLinspectorBot/[\d.]+' +- regex: 'URLinspectorBot' name: 'URLinspector' category: 'Site Monitor' url: 'https://www.urlinspector.com/bot/' @@ -3545,7 +3548,7 @@ name: 'LinkResearchTools GmbH' url: 'https://www.linkresearchtools.com/' -- regex: 'EntferBot/[\d.]+' +- regex: 'EntferBot' name: 'Entfer' category: 'Crawler' url: 'https://entfer.com/' @@ -3553,7 +3556,7 @@ name: 'Entfer Ltd.' url: 'https://entfer.com/' -- regex: 'TagInspector/[\d.]+' +- regex: 'TagInspector' name: 'Tag Inspector' category: 'Crawler' url: 'https://taginspector.com/' @@ -3577,7 +3580,7 @@ name: 'Diffbot Technologies Corp.' url: 'https://www.diffbot.com/' -- regex: 'DisqusAdstxtCrawler/[\d.]+' +- regex: 'DisqusAdstxtCrawler' name: 'Disqus' category: 'Crawler' url: 'https://help.disqus.com/en/articles/1765357-ads-txt-implementation-guide' @@ -3585,7 +3588,7 @@ name: 'Disqus, Inc.' url: 'https://disqus.com/' -- regex: 'startmebot/[\d.]+' +- regex: 'startmebot' name: 'start.me' category: 'Crawler' url: 'https://about.start.me/' @@ -3593,17 +3596,17 @@ name: 'start.me BV' url: 'https://about.start.me/' -- regex: '2ip bot/[\d.]+' +- regex: '2ip bot' name: '2ip' category: 'Crawler' url: 'https://2ip.io/' -- regex: 'ReqBin Curl Client/[\d.]+' +- regex: 'ReqBin Curl Client' name: 'ReqBin' category: 'Crawler' url: 'https://reqbin.com/curl' -- regex: 'XoviBot/[\d.]+' +- regex: 'XoviBot' name: 'XoviBot' category: 'Crawler' url: 'https://www.xovibot.net' @@ -3611,12 +3614,12 @@ name: 'Xovi GmbH' url: 'http://www.xovi.de' -- regex: 'Overcast/[\d.]+ Podcast Sync' +- regex: 'Overcast/.+Podcast Sync' name: 'Overcast Podcast Sync' category: 'Service Agent' url: 'https://overcast.fm/podcasterinfo' -- regex: '^Verity/[\d.]+' +- regex: '^Verity' name: 'GumGum Verity' category: 'Service Agent' url: 'https://gumgum.com/verity' @@ -3626,7 +3629,7 @@ category: 'Feed Reader' url: 'https://github.com/snarfed/hackermention' -- regex: 'BitSightBot/[\d.]+' +- regex: 'BitSightBot' name: 'BitSight' category: 'Security Checker' url: 'https://www.bitsight.com/' @@ -3634,7 +3637,7 @@ name: 'BitSight Technologies, Inc.' url: 'https://www.bitsight.com/' -- regex: 'Ezgif/[\d.]+' +- regex: 'Ezgif' name: 'Ezgif' category: 'Service Agent' url: 'https://ezgif.com/about' @@ -3647,7 +3650,7 @@ name: 'Kleissner Investments s.r.o.' url: 'https://intelx.io/' -- regex: 'FemtosearchBot/[\d.]+' +- regex: 'FemtosearchBot' name: 'Femtosearch' category: 'Crawler' url: 'http://femtosearch.com/' @@ -3655,7 +3658,7 @@ name: 'Grier Forensics, LLC' url: 'https://www.grierforensics.com/' -- regex: 'AdsTxtCrawler/[\d.]+' +- regex: 'AdsTxtCrawler/' name: 'AdsTxtCrawler' category: 'Crawler' url: 'https://github.com/InteractiveAdvertisingBureau/adstxtcrawler' @@ -3671,7 +3674,7 @@ name: 'Morningscore' url: 'https://morningscore.io/' -- regex: 'Uptime-Kuma/[\d.]+' +- regex: 'Uptime-Kuma' name: 'Uptime-Kuma' category: 'Site Monitor' url: 'https://github.com/louislam/uptime-kuma' @@ -3684,7 +3687,7 @@ name: 'OpenAI OpCo, LLC' url: 'https://openai.com/' -- regex: 'GPTBot/[\d.]+' +- regex: 'GPTBot' name: 'GPTBot' category: 'Crawler' url: 'https://platform.openai.com/docs/bots' @@ -3700,7 +3703,7 @@ name: 'OpenAI OpCo, LLC' url: 'https://openai.com/' -- regex: 'BrightEdge Crawler/[\d.]+' +- regex: 'BrightEdge Crawler' name: 'BrightEdge' category: 'Crawler' url: 'https://www.brightedge.com/' @@ -3708,7 +3711,7 @@ name: 'BrightEdge Technologies, Inc' url: 'https://www.brightedge.com/' -- regex: 'sfFeedReader/[\d.]+' +- regex: 'sfFeedReader' name: 'sfFeedReader' url: 'https://github.com/diem-project/sfFeed2Plugin' category: 'Feed Fetcher' @@ -3729,7 +3732,7 @@ name: 'RWTH Aachen University' url: 'https://www.comsys.rwth-aachen.de/' -- regex: 'newspaper/[\d.]+' +- regex: 'newspaper' name: 'Scraping Robot' category: 'Crawler' url: 'https://scrapingrobot.com/' @@ -3737,7 +3740,7 @@ name: 'Sprious LLC' url: 'https://sprious.com/' -- regex: 'Ant(?:\.com beta|Bot)(?:/([\d+.]+))?' +- regex: 'Ant(?:\.com beta|Bot)' name: 'Ant' category: 'Crawler' url: 'https://www.ant.com/' @@ -3745,7 +3748,7 @@ name: 'Ant.com Ltd.' url: 'https://www.ant.com/' -- regex: 'WebwikiBot/[\d.]+' +- regex: 'WebwikiBot' name: 'Webwiki' category: 'Crawler' url: 'https://www.webwiki.com/' @@ -3758,7 +3761,7 @@ category: 'Service Agent' url: 'https://www.phpmyadmin.net/' -- regex: 'Matomo/[\d.]+' +- regex: 'Matomo' name: 'Matomo' category: 'Service Agent' url: 'https://github.com/matomo-org/matomo' @@ -3766,7 +3769,7 @@ name: 'InnoCraft Ltd' url: 'https://matomo.org/' -- regex: 'Prometheus/[\d.]+' +- regex: 'Prometheus' name: 'Prometheus' category: 'Service Agent' url: 'https://github.com/prometheus/prometheus' @@ -3782,7 +3785,7 @@ name: 'ArchiveTeam' url: 'https://wiki.archiveteam.org/' -- regex: 'MADBbot/[\d.]+' +- regex: 'MADBbot' name: 'MADBbot' category: 'Crawler' url: 'https://madb.zapto.org/bot.html' @@ -3838,7 +3841,7 @@ producer: name: 'Tactikast' -- regex: 'Brightbot ([\d+.]+)' +- regex: 'Brightbot' name: 'BrightBot' category: 'Crawler' url: 'https://www.brightbot.app/' @@ -3846,7 +3849,7 @@ name: 'Bright Interactive Ltd' url: 'https://www.builtbybright.com/' -- regex: 'DaspeedBot/([\d+.]+)' +- regex: 'DaspeedBot' name: 'DaspeedBot' category: 'Crawler' url: 'https://daspeed.io/' @@ -3854,7 +3857,7 @@ name: 'DAWAP SARL' url: 'https://dawap.fr/' -- regex: 'StractBot(?:/([\d+.]+))?' +- regex: 'StractBot' name: 'Stract' category: 'Crawler' url: 'https://stract.com/webmasters' @@ -3862,7 +3865,7 @@ name: 'Stract' url: 'https://github.com/StractOrg/stract/' -- regex: 'GeedoBot(?:/([\d+.]+))?' +- regex: 'GeedoBot' name: 'GeedoBot' category: 'Crawler' url: 'https://geedo.com/bot/' @@ -3872,7 +3875,7 @@ category: 'Crawler' url: 'https://geedo.com/product-search/' -- regex: 'BackupLand(?:/([\d+.]+))?' +- regex: 'BackupLand' name: 'BackupLand' category: 'Crawler' url: 'https://go.backupland.com/' @@ -3880,7 +3883,7 @@ name: 'ООО «КВАРТА»' url: 'https://go.backupland.com/' -- regex: 'Konturbot(?:/([\d+.]+))?' +- regex: 'Konturbot' name: 'Konturbot' category: 'Crawler' url: 'https://kontur.ru/' @@ -3896,17 +3899,17 @@ name: 'ООО «МОДЕСКО»' url: 'https://www.modesco.ru/' -- regex: 'LetsearchBot(?:/([\d+.]+))?' +- regex: 'LetsearchBot' name: 'LetSearch' category: 'Crawler' url: 'https://letsearch.ru/bots' -- regex: 'Example3(?:/([\d+.]+))?' +- regex: 'Example3' name: 'Example3' category: 'Crawler' url: 'https://www.example3.com/' -- regex: 'StatOnlineRuBot(?:/([\d+.]+))?' +- regex: 'StatOnlineRuBot' name: 'StatOnline.ru' category: 'Crawler' url: 'https://statonline.ru/' @@ -3990,7 +3993,7 @@ name: 'Anthropic, PBC' url: 'https://www.anthropic.com/' -- regex: 'NetpeakCheckerBot/[\d.]+' +- regex: 'NetpeakCheckerBot' name: 'Netpeak Checker' category: 'Crawler' url: 'https://netpeaksoftware.com/checker' @@ -3998,7 +4001,7 @@ name: 'Netpeak LTD' url: 'https://netpeaksoftware.com/' -- regex: 'SandobaCrawler/[\d.]+' +- regex: 'SandobaCrawler' name: 'Sandoba//Crawler' category: 'Crawler' url: 'https://www.sandoba.com/en/crawler/' @@ -4014,7 +4017,7 @@ name: 'Sirdata SAS' url: 'https://www.sirdata.com/' -- regex: 'CheckMarkNetwork/[\d.]+' +- regex: 'CheckMarkNetwork' name: 'CheckMark Network' category: 'Crawler' url: 'https://www.checkmarknetwork.com/spider.html/' @@ -4030,7 +4033,7 @@ name: 'Cohere, Inc.' url: 'https://cohere.com/' -- regex: 'PerplexityBot/[\d.]+' +- regex: 'PerplexityBot' name: 'PerplexityBot' category: 'Crawler' url: 'https://docs.perplexity.ai/docs/perplexitybot' @@ -4054,12 +4057,12 @@ name: 'Metadot, Corp.' url: 'https://www.metadot.com/' -- regex: 'Ruby, Twurly v[\d.]+' +- regex: 'Ruby, Twurly v' name: 'Twurly' category: 'Crawler' url: 'https://twurly.org/' -- regex: 'Mixnode(?:(?:Cache)?/[\d.]+)?' +- regex: 'Mixnode(?:Cache)?' name: 'Mixnode' category: 'Crawler' url: 'https://www.mixnode.com/' @@ -4067,11 +4070,11 @@ name: 'Mixnode Technologies, Inc.' url: 'https://www.mixnode.com/' -- regex: 'CSSCheck/[\d.]+' +- regex: 'CSSCheck' name: 'CSSCheck' category: 'Validator' -- regex: 'MicrosoftPreview/[\d.]+' +- regex: 'MicrosoftPreview' name: 'Microsoft Preview' category: 'Service Agent' url: 'https://www.bing.com/webmasters/help/which-crawlers-does-bing-use-8c184ec0' @@ -4087,7 +4090,7 @@ name: 'Chronicle Security Ireland Limited' url: 'https://chronicle.security/' -- regex: 'TinEye/[\d.]+' +- regex: 'TinEye' name: 'TinEye' category: 'Crawler' url: 'https://tineye.com/' @@ -4119,7 +4122,7 @@ name: 'AdScanner d.o.o' url: 'https://www.alleyesonscreens.com/' -- regex: 'online-webceo-bot/[\d.]+' +- regex: 'online-webceo-bot' name: 'WebCEO' category: 'Crawler' url: 'https://www.webceo.com/' @@ -4140,7 +4143,7 @@ name: 'Vistex LTD' url: 'https://www.htmlyse.com/' -- regex: 'TrendsmapResolver/[\d.]+' +- regex: 'TrendsmapResolver' name: 'Trendsmap' category: 'Crawler' url: 'https://www.trendsmap.com/' @@ -4148,7 +4151,7 @@ name: 'Trendsmap Pty Ltd' url: 'https://www.trendsmap.com/' -- regex: 'Shareaholic(?:bot)?/[\d.]+' +- regex: 'Shareaholic(?:bot)?' name: 'Steve Bot' category: 'Crawler' url: 'https://www.shareaholic.com/steve' @@ -4182,7 +4185,7 @@ category: 'Service Agent' url: 'https://www.whatsmyip.org/ua/' -- regex: 'SenutoBot/[\d.]+' +- regex: 'SenutoBot' name: 'Senuto' category: 'Crawler' url: 'https://www.senuto.com/' @@ -4198,7 +4201,7 @@ name: 'Doly Horjun HJ' url: 'https://gozle.com.tm/' -- regex: 'Quantcastbot/[\d.]+' +- regex: 'Quantcastbot' name: 'Quantcast' category: 'Crawler' url: 'https://www.quantcast.com/bot/' @@ -4238,7 +4241,7 @@ name: 'Barracuda Networks, Inc.' url: 'https://www.barracudanetworks.com/' -- regex: 'RuxitSynthetic/[\d.]+' +- regex: 'RuxitSynthetic' name: 'RuxitSynthetic' category: 'Site Monitor' url: 'https://community.dynatrace.com/t5/Troubleshooting/Basic-Commands-for-Synthetic/ta-p/198164' @@ -4246,7 +4249,7 @@ name: 'Dynatrace LLC' url: 'https://www.dynatrace.com/' -- regex: 'DynatraceSynthetic/[\d.]+' +- regex: 'DynatraceSynthetic' name: 'DynatraceSynthetic' category: 'Site Monitor' url: 'https://community.dynatrace.com/t5/Troubleshooting/Basic-Commands-for-Synthetic/ta-p/198164' @@ -4262,7 +4265,7 @@ name: 'Sitebulb Limited' url: 'https://sitebulb.com/' -- regex: 'Monsidobot/[\d.]+' +- regex: 'Monsidobot' name: 'Monsidobot' category: 'Crawler' url: 'https://monsido.com/bot-html' @@ -4291,7 +4294,7 @@ category: 'Service Agent' url: 'https://www.google.com/script/start/' -- regex: 'SiteOne-Crawler/[\d.]+' +- regex: 'SiteOne-Crawler' name: 'SiteOne Crawler' category: 'Crawler' url: 'https://crawler.siteone.io/bot/' @@ -4315,7 +4318,7 @@ name: 'Axeman Technology Solutions LLP' url: 'https://axemantech.com/' -- regex: 'Paqlebot/[\d.]+' +- regex: 'Paqlebot' name: 'Paqlebot' category: 'Crawler' url: 'https://www.paqle.dk/about/paqlebot' @@ -4333,7 +4336,7 @@ category: 'Crawler' url: 'https://github.com/matrix-org/synapse' -- regex: 'OSZKbot/[\d.]+' +- regex: 'OSZKbot' name: 'OSZKbot' category: 'Crawler' url: 'http://mekosztaly.oszk.hu/mia/' @@ -4349,7 +4352,7 @@ name: 'SEO Cube S.r.l.' url: 'https://www.seocube.it/' -- regex: 'RavenCrawler/[\d.]+' +- regex: 'RavenCrawler' name: 'RavenCrawler' category: 'Crawler' url: 'https://raventools.com/site-auditor/' @@ -4365,7 +4368,7 @@ name: 'Kadolijst' url: 'https://www.kadolijst.nl/' -- regex: 'Dubbotbot/[\d.]+' +- regex: 'Dubbotbot' name: 'Dubbotbot' category: 'Crawler' url: 'https://help.dubbot.com/en/articles/6746594-example-custom-user-agent' @@ -4373,7 +4376,7 @@ name: 'DubBot' url: 'https://dubbot.com/' -- regex: 'Swiftbot/[\d.]+' +- regex: 'Swiftbot' name: 'Swiftbot' category: 'Crawler' url: 'https://swiftype.com/swiftbot' @@ -4397,15 +4400,15 @@ name: 'Cisco Systems, Inc.' url: 'https://www.cisco.com/' -- regex: 'OmtrBot/[\d.]+' +- regex: 'OmtrBot' name: 'OmtrBot' category: 'Site Monitor' -- regex: 'WebMon/[\d.]+' +- regex: 'WebMon' name: 'WebMon' category: 'Site Monitor' -- regex: 'AdsTxtCrawlerTP/[\d.]+' +- regex: 'AdsTxtCrawlerTP' name: 'AdsTxtCrawlerTP' category: 'Crawler' @@ -4425,7 +4428,7 @@ name: 'Clickagy, LLC' url: 'https://www.clickagy.com/' -- regex: 'kiwitcms-gitops/[\d.]+' +- regex: 'kiwitcms-gitops' name: 'Kiwi TCMS GitOps' category: 'Service Agent' url: 'https://kiwitcms.org' @@ -4483,7 +4486,7 @@ name: 'Probely - Soluções de Cibersegurança, S.A.' url: 'https://probely.com/' -- regex: 'Uptimia(?:/[\d.]+)?' +- regex: 'Uptimia' name: 'Uptimia' category: 'Site Monitor' url: 'https://www.uptimia.com/' @@ -4491,7 +4494,7 @@ name: 'JJ Online GmbH' url: 'https://www.uptimia.com/' -- regex: '2GDPR/[\d.]+' +- regex: '2GDPR' name: '2GDPR' category: 'Service Agent' url: 'https://2gdpr.com/tos' @@ -4515,7 +4518,7 @@ name: 'CheckHost' url: 'https://check-host.net/' -- regex: 'LAC_IAHarvester/[\d.]+' +- regex: 'LAC_IAHarvester' name: 'LAC IA Harvester' category: 'Crawler' url: 'https://library-archives.canada.ca/eng/services/government-canada/web-social-media-preservation-program/Pages/web-archive.aspx' @@ -4523,7 +4526,7 @@ name: 'Library and Archives Canada' url: 'https://library-archives.canada.ca/' -- regex: 'InsytfulBot/[\d.]+' +- regex: 'InsytfulBot' name: 'InsytfulBot' category: 'Crawler' url: 'https://www.insytful.com/' @@ -4539,7 +4542,7 @@ name: 'Statista, Inc.' url: 'https://www.statista.com/' -- regex: 'SubstackContentFetch/[\d.]+' +- regex: 'SubstackContentFetch' name: 'Substack Content Fetch' category: 'Crawler' url: 'https://substack.com/' @@ -4571,7 +4574,7 @@ name: 'Tenable, Inc.' url: 'https://www.tenable.com/' -- regex: 'Castopod/[\d.]+' +- regex: 'Castopod' name: 'Castopod' category: 'Crawler' url: 'https://www.castopod.org/' @@ -4584,7 +4587,7 @@ name: 'Elasticsearch B.V.' url: 'https://www.elastic.co/' -- regex: 'WDG_Validator/[\d.]+' +- regex: 'WDG_Validator' name: 'WDG HTML Validator' category: 'Validator' url: 'http://www.htmlhelp.com/tools/validator/' @@ -4594,7 +4597,7 @@ category: 'Crawler' url: 'https://web.archive.org/web/20180910002802/http://www.aegis.network/' -- regex: 'CrawlyProjectCrawler/[\d.]+' +- regex: 'CrawlyProjectCrawler' name: 'Crawly Project' category: 'Crawler' url: 'https://web.archive.org/web/20240326141952/https://crawlyproject.digitaldragon.dev/' @@ -4609,7 +4612,7 @@ category: 'Security Checker' url: 'https://github.com/openeasm/punkmap' -- regex: 'GenomeCrawlerd/[\d.]+' +- regex: 'GenomeCrawlerd' name: 'Deepfield Genome' category: 'Crawler' url: 'https://www.nokia.com/networks/ip-networks/deepfield/genome/' @@ -4617,12 +4620,12 @@ name: 'Nokia Corporation' url: 'https://www.nokia.com/' -- regex: 'Gaisbot/[\d.]+' +- regex: 'Gaisbot' name: 'Gaisbot' category: 'Crawler' url: 'https://web.archive.org/web/20090604121511/https://gais.cs.ccu.edu.tw/robot.php' -- regex: 'FAST-WebCrawler/[\d.]+' +- regex: 'FAST-WebCrawler' name: 'AlltheWeb' category: 'Crawler' url: 'https://web.archive.org/web/20041020050801/http://www.alltheweb.com/help/webmaster/crawler' @@ -4632,7 +4635,7 @@ category: 'Security Checker' url: 'https://ducks.party/' -- regex: 'DepSpid/[\d.]+' +- regex: 'DepSpid' name: 'DepSpid' category: 'Crawler' url: 'https://web.archive.org/web/20080321224033/http://about.depspid.net/' @@ -4743,7 +4746,7 @@ name: 'Marginalia' url: 'https://www.marginalia.nu/' -- regex: 'vu-server-health-scanner/[\d.]+' +- regex: 'vu-server-health-scanner' name: 'VU Server Health Scanner' category: 'Security Checker' url: 'https://130.37.198.75/index.html' @@ -4944,14 +4947,6 @@ name: 'University of Passau' url: 'https://www.uni-passau.de/en/' -- regex: 'Chatwork LinkPreview' - name: 'Chatwork LinkPreview' - category: 'Service Agent' - url: 'https://go.chatwork.com/en/' - producer: - name: 'kubell Co., Ltd.' - url: 'https://www.kubell.com/en/' - - regex: 'WPMU DEV' name: 'WPMU DEV' category: 'Crawler'