diff --git a/src/TumblThree/TumblThree.Applications/Crawler/TumblrBlogCrawler.cs b/src/TumblThree/TumblThree.Applications/Crawler/TumblrBlogCrawler.cs index 4413553a..578752bc 100644 --- a/src/TumblThree/TumblThree.Applications/Crawler/TumblrBlogCrawler.cs +++ b/src/TumblThree/TumblThree.Applications/Crawler/TumblrBlogCrawler.cs @@ -31,6 +31,7 @@ namespace TumblThree.Applications.Crawler public class TumblrBlogCrawler : AbstractTumblrCrawler, ICrawler, IDisposable { private static readonly Regex extractJsonFromPage = new Regex("window\\['___INITIAL_STATE___'] = ({.*});"); + private static readonly Regex extractJsonFromPage2 = new Regex("id=\"___INITIAL_STATE___\">\\s*?({.*})\\s*?", RegexOptions.Singleline); private readonly IDownloader downloader; private readonly ITumblrToTextParser tumblrJsonParser; @@ -321,6 +322,7 @@ private async Task GetHighestPostIdCoreAsync() if (document.Contains("___INITIAL_STATE___")) { var extracted = extractJsonFromPage.Match(document).Groups[1].Value; + if (string.IsNullOrEmpty(extracted)) extracted = extractJsonFromPage2.Match(document).Groups[1].Value; dynamic obj = JsonConvert.DeserializeObject(extracted); pinnedId = obj?.PeeprRoute?.initialTimeline?.objects?[0]?.id ?? ""; } diff --git a/src/TumblThree/TumblThree.Applications/Crawler/TumblrLikedByCrawler.cs b/src/TumblThree/TumblThree.Applications/Crawler/TumblrLikedByCrawler.cs index 48aefcd2..7f815ffc 100644 --- a/src/TumblThree/TumblThree.Applications/Crawler/TumblrLikedByCrawler.cs +++ b/src/TumblThree/TumblThree.Applications/Crawler/TumblrLikedByCrawler.cs @@ -31,6 +31,7 @@ namespace TumblThree.Applications.Crawler public class TumblrLikedByCrawler : AbstractTumblrCrawler, ICrawler, IDisposable { private static readonly Regex extractJsonFromLikes = new Regex("window\\['___INITIAL_STATE___'\\] = (.*);[\\s]*?", RegexOptions.Singleline); + private static readonly Regex extractJsonFromLikes2 = new Regex("id=\"___INITIAL_STATE___\">\\s*?({.*})\\s*?", RegexOptions.Singleline); private readonly IDownloader downloader; private readonly ITumblrToTextParser tumblrJsonParser; @@ -340,6 +341,7 @@ private bool PostWithinTimespan(DataModels.TumblrSearchJson.Data post) private static List ExtractPosts(string document) { var extracted = extractJsonFromLikes.Match(document).Groups[1].Value; + if (string.IsNullOrEmpty(extracted)) extracted = extractJsonFromLikes2.Match(document).Groups[1].Value; if (string.IsNullOrEmpty(extracted)) { Logger.Verbose("TumblrLikedByCrawler:ExtractPosts: data not found inside: \n{0}", document); @@ -685,6 +687,7 @@ private async Task CheckIfLoggedInAsync() if (document.Contains("___INITIAL_STATE___")) { var extracted = extractJsonFromLikes.Match(document).Groups[1].Value; + if (string.IsNullOrEmpty(extracted)) extracted = extractJsonFromLikes2.Match(document).Groups[1].Value; if (string.IsNullOrEmpty(extracted)) { Logger.Verbose("TumblrLikedByCrawler:CheckIfLoggedInAsync: data not found inside: \n{0}", document); diff --git a/src/TumblThree/TumblThree.Applications/Crawler/TumblrSearchCrawler.cs b/src/TumblThree/TumblThree.Applications/Crawler/TumblrSearchCrawler.cs index b4d95517..a21ae96d 100644 --- a/src/TumblThree/TumblThree.Applications/Crawler/TumblrSearchCrawler.cs +++ b/src/TumblThree/TumblThree.Applications/Crawler/TumblrSearchCrawler.cs @@ -31,6 +31,7 @@ namespace TumblThree.Applications.Crawler public class TumblrSearchCrawler : AbstractTumblrCrawler, ICrawler, IDisposable { private static readonly Regex extractJsonFromSearch = new Regex("window\\['___INITIAL_STATE___'\\] = (.*);"); + private static readonly Regex extractJsonFromSearch2 = new Regex("id=\"___INITIAL_STATE___\">\\s*?({.*})\\s*?", RegexOptions.Singleline); private readonly IShellService shellService; private readonly IDownloader downloader; @@ -117,6 +118,7 @@ private async Task CrawlPageAsync() { string document = await GetSearchPageAsync(); string json = extractJsonFromSearch.Match(document).Groups[1].Value; + if (string.IsNullOrEmpty(json)) json = extractJsonFromSearch2.Match(document).Groups[1].Value; dynamic result = JsonConvert.DeserializeObject(json, new ExpandoObjectConverter()); string nextUrl = ""; string bearerToken = ""; diff --git a/src/TumblThree/TumblThree.Applications/Crawler/TumblrTagSearchCrawler.cs b/src/TumblThree/TumblThree.Applications/Crawler/TumblrTagSearchCrawler.cs index 8fcdb24f..faa050b3 100644 --- a/src/TumblThree/TumblThree.Applications/Crawler/TumblrTagSearchCrawler.cs +++ b/src/TumblThree/TumblThree.Applications/Crawler/TumblrTagSearchCrawler.cs @@ -29,6 +29,7 @@ namespace TumblThree.Applications.Crawler public class TumblrTagSearchCrawler : AbstractTumblrCrawler, ICrawler, IDisposable { private static readonly Regex extractJsonFromSearch = new Regex("window\\['___INITIAL_STATE___'\\] = (.*);"); + private static readonly Regex extractJsonFromSearch2 = new Regex("id=\"___INITIAL_STATE___\">\\s*?({.*})\\s*?", RegexOptions.Singleline); private readonly IDownloader downloader; private readonly IPostQueue> jsonQueue; @@ -119,12 +120,14 @@ private async Task CrawlPageAsync() { string document = await GetTaggedSearchPageAsync(); string json = extractJsonFromSearch.Match(document).Groups[1].Value; + if (string.IsNullOrEmpty(json)) json = extractJsonFromSearch2.Match(document).Groups[1].Value; TagSearch result = ConvertJsonToClass(json); if (result.Tagged.ShouldRedirect) { document = await GetTaggedSearchPageAsync(true); json = extractJsonFromSearch.Match(document).Groups[1].Value; + if (string.IsNullOrEmpty(json)) json = extractJsonFromSearch2.Match(document).Groups[1].Value; result = ConvertJsonToClass(json); }