Skip to content

Commit

Permalink
Fix issue "Crawlers broken due to page structure change"
Browse files (browse the repository at this point in the history)
- Tumblr changed the page structure, which broke the JSON extraction.
  • Loading branch information
thomas694 committed Apr 14, 2024
1 parent 85637ec commit 8c51e23
Show file tree
Hide file tree
Showing 4 changed files with 10 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ namespace TumblThree.Applications.Crawler
public class TumblrBlogCrawler : AbstractTumblrCrawler, ICrawler, IDisposable
{
// Matches the inline assignment window['___INITIAL_STATE___'] = {...}; and captures the
// brace-delimited object in group 1. No RegexOptions are set, so '.' does not match a
// newline and the capture stays on a single line.
private static readonly Regex extractJsonFromPage = new Regex("window\\['___INITIAL_STATE___'] = ({.*});");
// Fallback tried when the pattern above captures nothing: matches the text following
// id="___INITIAL_STATE___"> up to a closing </script> and captures the {...} object in group 1.
// NOTE(review): '.*' is greedy and Singleline lets '.' cross newlines, so group 1 extends to the
// LAST '}' that is followed by optional whitespace and </script> - confirm that no later script
// block on the page ends that way.
private static readonly Regex extractJsonFromPage2 = new Regex("id=\"___INITIAL_STATE___\">\\s*?({.*})\\s*?</script>", RegexOptions.Singleline);

private readonly IDownloader downloader;
private readonly ITumblrToTextParser<Post> tumblrJsonParser;
Expand Down Expand Up @@ -321,6 +322,7 @@ private async Task<ulong> GetHighestPostIdCoreAsync()
if (document.Contains("___INITIAL_STATE___"))
{
var extracted = extractJsonFromPage.Match(document).Groups[1].Value;
if (string.IsNullOrEmpty(extracted)) extracted = extractJsonFromPage2.Match(document).Groups[1].Value;
dynamic obj = JsonConvert.DeserializeObject<ExpandoObject>(extracted);
pinnedId = obj?.PeeprRoute?.initialTimeline?.objects?[0]?.id ?? "";
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ namespace TumblThree.Applications.Crawler
public class TumblrLikedByCrawler : AbstractTumblrCrawler, ICrawler, IDisposable
{
// Matches window['___INITIAL_STATE___'] = ...; followed by optional whitespace and </script>.
// Group 1 is everything between "= " and that final ';' (it is not required to start with '{').
// Singleline lets '.' match newlines, so the value may span several lines.
private static readonly Regex extractJsonFromLikes = new Regex("window\\['___INITIAL_STATE___'\\] = (.*);[\\s]*?</script>", RegexOptions.Singleline);
// Fallback tried when the pattern above captures nothing: matches the text following
// id="___INITIAL_STATE___"> up to a closing </script> and captures the {...} object in group 1.
// NOTE(review): '.*' is greedy and Singleline lets '.' cross newlines, so group 1 extends to the
// LAST '}' that is followed by optional whitespace and </script> - confirm that no later script
// block on the page ends that way.
private static readonly Regex extractJsonFromLikes2 = new Regex("id=\"___INITIAL_STATE___\">\\s*?({.*})\\s*?</script>", RegexOptions.Singleline);

private readonly IDownloader downloader;
private readonly ITumblrToTextParser<Post> tumblrJsonParser;
Expand Down Expand Up @@ -340,6 +341,7 @@ private bool PostWithinTimespan(DataModels.TumblrSearchJson.Data post)
private static List<DataModels.TumblrSearchJson.Data> ExtractPosts(string document)
{
var extracted = extractJsonFromLikes.Match(document).Groups[1].Value;
if (string.IsNullOrEmpty(extracted)) extracted = extractJsonFromLikes2.Match(document).Groups[1].Value;
if (string.IsNullOrEmpty(extracted))
{
Logger.Verbose("TumblrLikedByCrawler:ExtractPosts: data not found inside: \n{0}", document);
Expand Down Expand Up @@ -685,6 +687,7 @@ private async Task<bool> CheckIfLoggedInAsync()
if (document.Contains("___INITIAL_STATE___"))
{
var extracted = extractJsonFromLikes.Match(document).Groups[1].Value;
if (string.IsNullOrEmpty(extracted)) extracted = extractJsonFromLikes2.Match(document).Groups[1].Value;
if (string.IsNullOrEmpty(extracted))
{
Logger.Verbose("TumblrLikedByCrawler:CheckIfLoggedInAsync: data not found inside: \n{0}", document);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ namespace TumblThree.Applications.Crawler
public class TumblrSearchCrawler : AbstractTumblrCrawler, ICrawler, IDisposable
{
// Matches the inline assignment window['___INITIAL_STATE___'] = ...; and captures the assigned
// value in group 1. No RegexOptions are set, so '.' does not match a newline; the greedy '.*'
// runs to the last ';' on that line.
private static readonly Regex extractJsonFromSearch = new Regex("window\\['___INITIAL_STATE___'\\] = (.*);");
// Fallback tried when the pattern above captures nothing: matches the text following
// id="___INITIAL_STATE___"> up to a closing </script> and captures the {...} object in group 1.
// NOTE(review): '.*' is greedy and Singleline lets '.' cross newlines, so group 1 extends to the
// LAST '}' that is followed by optional whitespace and </script> - confirm that no later script
// block on the page ends that way.
private static readonly Regex extractJsonFromSearch2 = new Regex("id=\"___INITIAL_STATE___\">\\s*?({.*})\\s*?</script>", RegexOptions.Singleline);

private readonly IShellService shellService;
private readonly IDownloader downloader;
Expand Down Expand Up @@ -117,6 +118,7 @@ private async Task CrawlPageAsync()
{
string document = await GetSearchPageAsync();
string json = extractJsonFromSearch.Match(document).Groups[1].Value;
if (string.IsNullOrEmpty(json)) json = extractJsonFromSearch2.Match(document).Groups[1].Value;
dynamic result = JsonConvert.DeserializeObject<ExpandoObject>(json, new ExpandoObjectConverter());
string nextUrl = "";
string bearerToken = "";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ namespace TumblThree.Applications.Crawler
public class TumblrTagSearchCrawler : AbstractTumblrCrawler, ICrawler, IDisposable
{
// Matches the inline assignment window['___INITIAL_STATE___'] = ...; and captures the assigned
// value in group 1. No RegexOptions are set, so '.' does not match a newline; the greedy '.*'
// runs to the last ';' on that line.
private static readonly Regex extractJsonFromSearch = new Regex("window\\['___INITIAL_STATE___'\\] = (.*);");
// Fallback tried when the pattern above captures nothing: matches the text following
// id="___INITIAL_STATE___"> up to a closing </script> and captures the {...} object in group 1.
// NOTE(review): '.*' is greedy and Singleline lets '.' cross newlines, so group 1 extends to the
// LAST '}' that is followed by optional whitespace and </script> - confirm that no later script
// block on the page ends that way.
private static readonly Regex extractJsonFromSearch2 = new Regex("id=\"___INITIAL_STATE___\">\\s*?({.*})\\s*?</script>", RegexOptions.Singleline);

private readonly IDownloader downloader;
private readonly IPostQueue<CrawlerData<Datum>> jsonQueue;
Expand Down Expand Up @@ -119,12 +120,14 @@ private async Task CrawlPageAsync()
{
string document = await GetTaggedSearchPageAsync();
string json = extractJsonFromSearch.Match(document).Groups[1].Value;
if (string.IsNullOrEmpty(json)) json = extractJsonFromSearch2.Match(document).Groups[1].Value;
TagSearch result = ConvertJsonToClass<TagSearch>(json);

if (result.Tagged.ShouldRedirect)
{
document = await GetTaggedSearchPageAsync(true);
json = extractJsonFromSearch.Match(document).Groups[1].Value;
if (string.IsNullOrEmpty(json)) json = extractJsonFromSearch2.Match(document).Groups[1].Value;
result = ConvertJsonToClass<TagSearch>(json);
}

Expand Down

0 comments on commit 8c51e23

Please sign in to comment.