Fix issue: crawlers broken due to page structure change

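- Tumblr changed the page structure, which broke the JSON extraction: when the existing `window['___INITIAL_STATE___'] = ...` regex finds nothing, the crawlers now fall back to a second regex that matches the `id="___INITIAL_STATE___">{...}</script>` markup.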
TumblThreeApp · Apr 14, 2024 · 8c51e23 · 8c51e23
1 parent 85637ec
commit 8c51e23
Show file tree

Hide file tree

Showing 4 changed files with 10 additions and 0 deletions.
diff --git a/src/TumblThree/TumblThree.Applications/Crawler/TumblrBlogCrawler.cs b/src/TumblThree/TumblThree.Applications/Crawler/TumblrBlogCrawler.cs
@@ -31,6 +31,7 @@ namespace TumblThree.Applications.Crawler
     public class TumblrBlogCrawler : AbstractTumblrCrawler, ICrawler, IDisposable
     {
         private static readonly Regex extractJsonFromPage = new Regex("window\\['___INITIAL_STATE___'] = ({.*});");
+        private static readonly Regex extractJsonFromPage2 = new Regex("id=\"___INITIAL_STATE___\">\\s*?({.*})\\s*?</script>", RegexOptions.Singleline);
 
         private readonly IDownloader downloader;
         private readonly ITumblrToTextParser<Post> tumblrJsonParser;
@@ -321,6 +322,7 @@ private async Task<ulong> GetHighestPostIdCoreAsync()
             if (document.Contains("___INITIAL_STATE___"))
             {
                 var extracted = extractJsonFromPage.Match(document).Groups[1].Value;
+                if (string.IsNullOrEmpty(extracted)) extracted = extractJsonFromPage2.Match(document).Groups[1].Value;
                 dynamic obj = JsonConvert.DeserializeObject<ExpandoObject>(extracted);
                 pinnedId = obj?.PeeprRoute?.initialTimeline?.objects?[0]?.id ?? "";
             }

diff --git a/src/TumblThree/TumblThree.Applications/Crawler/TumblrLikedByCrawler.cs b/src/TumblThree/TumblThree.Applications/Crawler/TumblrLikedByCrawler.cs
@@ -31,6 +31,7 @@ namespace TumblThree.Applications.Crawler
     public class TumblrLikedByCrawler : AbstractTumblrCrawler, ICrawler, IDisposable
     {
         private static readonly Regex extractJsonFromLikes = new Regex("window\\['___INITIAL_STATE___'\\] = (.*);[\\s]*?</script>", RegexOptions.Singleline);
+        private static readonly Regex extractJsonFromLikes2 = new Regex("id=\"___INITIAL_STATE___\">\\s*?({.*})\\s*?</script>", RegexOptions.Singleline);
 
         private readonly IDownloader downloader;
         private readonly ITumblrToTextParser<Post> tumblrJsonParser;
@@ -340,6 +341,7 @@ private bool PostWithinTimespan(DataModels.TumblrSearchJson.Data post)
         private static List<DataModels.TumblrSearchJson.Data> ExtractPosts(string document)
         {
             var extracted = extractJsonFromLikes.Match(document).Groups[1].Value;
+            if (string.IsNullOrEmpty(extracted)) extracted = extractJsonFromLikes2.Match(document).Groups[1].Value;
             if (string.IsNullOrEmpty(extracted))
             {
                 Logger.Verbose("TumblrLikedByCrawler:ExtractPosts: data not found inside: \n{0}", document);
@@ -685,6 +687,7 @@ private async Task<bool> CheckIfLoggedInAsync()
                 if (document.Contains("___INITIAL_STATE___"))
                 {
                     var extracted = extractJsonFromLikes.Match(document).Groups[1].Value;
+                    if (string.IsNullOrEmpty(extracted)) extracted = extractJsonFromLikes2.Match(document).Groups[1].Value;
                     if (string.IsNullOrEmpty(extracted))
                     {
                         Logger.Verbose("TumblrLikedByCrawler:CheckIfLoggedInAsync: data not found inside: \n{0}", document);

diff --git a/src/TumblThree/TumblThree.Applications/Crawler/TumblrSearchCrawler.cs b/src/TumblThree/TumblThree.Applications/Crawler/TumblrSearchCrawler.cs
@@ -31,6 +31,7 @@ namespace TumblThree.Applications.Crawler
     public class TumblrSearchCrawler : AbstractTumblrCrawler, ICrawler, IDisposable
     {
         private static readonly Regex extractJsonFromSearch = new Regex("window\\['___INITIAL_STATE___'\\] = (.*);");
+        private static readonly Regex extractJsonFromSearch2 = new Regex("id=\"___INITIAL_STATE___\">\\s*?({.*})\\s*?</script>", RegexOptions.Singleline);
 
         private readonly IShellService shellService;
         private readonly IDownloader downloader;
@@ -117,6 +118,7 @@ private async Task CrawlPageAsync()
             {
                 string document = await GetSearchPageAsync();
                 string json = extractJsonFromSearch.Match(document).Groups[1].Value;
+                if (string.IsNullOrEmpty(json)) json = extractJsonFromSearch2.Match(document).Groups[1].Value;
                 dynamic result = JsonConvert.DeserializeObject<ExpandoObject>(json, new ExpandoObjectConverter());
                 string nextUrl = "";
                 string bearerToken = "";

diff --git a/src/TumblThree/TumblThree.Applications/Crawler/TumblrTagSearchCrawler.cs b/src/TumblThree/TumblThree.Applications/Crawler/TumblrTagSearchCrawler.cs
@@ -29,6 +29,7 @@ namespace TumblThree.Applications.Crawler
     public class TumblrTagSearchCrawler : AbstractTumblrCrawler, ICrawler, IDisposable
     {
         private static readonly Regex extractJsonFromSearch = new Regex("window\\['___INITIAL_STATE___'\\] = (.*);");
+        private static readonly Regex extractJsonFromSearch2 = new Regex("id=\"___INITIAL_STATE___\">\\s*?({.*})\\s*?</script>", RegexOptions.Singleline);
 
         private readonly IDownloader downloader;
         private readonly IPostQueue<CrawlerData<Datum>> jsonQueue;
@@ -119,12 +120,14 @@ private async Task CrawlPageAsync()
             {
                 string document = await GetTaggedSearchPageAsync();
                 string json = extractJsonFromSearch.Match(document).Groups[1].Value;
+                if (string.IsNullOrEmpty(json)) json = extractJsonFromSearch2.Match(document).Groups[1].Value;
                 TagSearch result = ConvertJsonToClass<TagSearch>(json);
 
                 if (result.Tagged.ShouldRedirect)
                 {
                     document = await GetTaggedSearchPageAsync(true);
                     json = extractJsonFromSearch.Match(document).Groups[1].Value;
+                    if (string.IsNullOrEmpty(json)) json = extractJsonFromSearch2.Match(document).Groups[1].Value;
                     result = ConvertJsonToClass<TagSearch>(json);
                 }