Skip to content

Commit

Permalink
Fix issue #564: Problems in Likes crawler
Browse files (browse the repository at this point in the history)
  • Loading branch information
thomas694 committed Sep 1, 2024
1 parent 42e3fed commit d5d57ac
Showing 1 changed file with 16 additions and 21 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -201,6 +201,16 @@ private async Task CrawlPageAsync(int crawlerNumber)
return;
}

if (isLikesUrl)
{
var posts = ExtractPosts(document);
await DownloadPage(posts);
}
else
{
await AddUrlsToDownloadListAsync(document);
}

pagination = ExtractNextPageLink(document);
pageNumber++;
var notWithinTimespan = !CheckIfWithinTimespan(pagination);
Expand All @@ -215,16 +225,6 @@ private async Task CrawlPageAsync(int crawlerNumber)
}
nextPage.Add(Blog.Url + (isLikesUrl ? "?before=" : "/page/" + pageNumber + "/") + pagination);

if (isLikesUrl)
{
var posts = ExtractPosts(document);
await DownloadPage(posts);
}
else
{
await AddUrlsToDownloadListAsync(document);
}

Interlocked.Increment(ref numberOfPagesCrawled);
UpdateProgressQueueInformation(Resources.ProgressGetUrlShort, numberOfPagesCrawled);
if (notWithinTimespan)
Expand Down Expand Up @@ -537,7 +537,7 @@ private void DownloadMedia(DataModels.TumblrSearchJson.Content content, Post dat
if (content.Provider == "tumblr" || url.Contains("tumblr.com") || Blog.RegExVideos)
{
string thumbnailUrl = content.Poster[0].Url;
AddToDownloadList(new PhotoPost(thumbnailUrl, data.Id, data.UnixTimestamp.ToString(), BuildFileName(thumbnailUrl, data, index)));
AddToDownloadList(new PhotoPost(thumbnailUrl, thumbnailUrl, data.Id, data.UnixTimestamp.ToString(), BuildFileName(thumbnailUrl, data, index)));
}
}
// can only download preview image for non-tumblr (embedded) video posts
Expand All @@ -556,12 +556,13 @@ private void DownloadMedia(DataModels.TumblrSearchJson.Content content, Post dat
{
if (Blog.DownloadPhoto)
{
var postedUrl = url;
if (url.Contains("tumblr.com/"))
{
url = RetrieveOriginalImageUrl(url, 2000, 3000, false);
url = CheckPnjUrl(url);
}
AddToDownloadList(new PhotoPost(url, data.Id, data.UnixTimestamp.ToString(), BuildFileName(url, data, index)));
AddToDownloadList(new PhotoPost(url, postedUrl, data.Id, data.UnixTimestamp.ToString(), BuildFileName(url, data, index)));
}
}
}
Expand Down Expand Up @@ -740,20 +741,14 @@ private static long ExtractNextPageLink(string document)
// <a id="next_page_link" href="/liked/by/wallpaperfx/page/5/1457139681" class="next button chrome blue">Next</a></div></div>

const string htmlPagination = "(id=\"next_page_link\" href=\"[A-Za-z0-9_/:.-]+/([0-9]+)/([A-Za-z0-9]+))\"";
const string jsonPagination = "&before=([0-9]*)";
const string jsonPagination2 = "\\?before=([0-9]*)";
const string jsonPagination = @"(&|\\?|\\u0026)before=([0-9]*)";

_ = long.TryParse(Regex.Match(document, htmlPagination).Groups[3].Value, out var unixTime);

if (unixTime == 0)
{
var r = Regex.Match(document, jsonPagination);
_ = long.TryParse(r.Groups[1].Value, out unixTime);

if (unixTime == 0)
{
_ = long.TryParse(Regex.Match(document, jsonPagination2).Groups[1].Value, out unixTime);
}
var r = Regex.Matches(document, jsonPagination);
_ = long.TryParse(r[r.Count-1].Groups[2].Value, out unixTime);
}

return unixTime;
Expand Down

0 comments on commit d5d57ac

Please sign in to comment.