-
Notifications
You must be signed in to change notification settings - Fork 49
Crawling a site for all external links
Alexander Nyquist edited this page Feb 25, 2014
·
1 revision
The following snippet crawls a specific site and reports all of its outbound links. Note the ExcludeHostsExcept filter: without it, the crawler would also follow and crawl every outbound link.
// Create the crawler first, then attach the exclusion filters.
var crawler = new Crawler();
crawler.ExcludeFilters = new IExcludeFilter[]
{
    // Restrict crawling to the listed host; without this filter the
    // outbound links themselves would be crawled as well.
    new ExcludeHostsExcept(new[] { "nyqui.st" }),
    new ExcludeImagesFilter(),
    new ExcludeTrackbacks(),
    new ExcludeMailTo(),
    new ExcludeJavaScript(),
    new ExcludeAnchors(),
};
// When the crawler signals completion, report it and end the process
// with exit code 0.
crawler.OnCompleted += delegate ()
{
    Console.WriteLine("[Main] Crawl completed!");
    Environment.Exit(0);
};
// For every downloaded page, log it and then list each link whose host
// differs from the host of the page it was found on.
crawler.OnPageDownloaded += downloaded =>
{
    var sourceUrl = downloaded.Url;
    Console.WriteLine("[Main] Downloaded page {0}", sourceUrl);

    foreach (var outgoing in downloaded.Links)
    {
        // Same host as the page itself: an internal link, nothing to report.
        if (outgoing.TargetUrl.Host == sourceUrl.Host)
        {
            continue;
        }

        Console.WriteLine("Found outbound link from {0} to {1}", sourceUrl, outgoing.TargetUrl);
    }
};
// Seed the crawl with the site's root URL and start the crawler.
var startUrl = new Uri("http://nyqui.st");
crawler.Enqueue(startUrl);
crawler.Start();

Console.WriteLine("[Main] Crawler started.");
Console.WriteLine("[Main] Press [enter] to abort.");

// Block until the user presses enter.
// NOTE(review): nothing here explicitly stops the crawler afterwards —
// presumably execution simply continues past this point; confirm.
Console.ReadLine();