From 6ff645cc2c8a1c2be474860b526df3ba909e575f Mon Sep 17 00:00:00 2001 From: Aaron Stannard Date: Fri, 3 Jan 2025 15:59:18 -0600 Subject: [PATCH] Added real CLI (#4) * completed first pass at the CLI itself * added Uri printing * escape links in markdown --- src/Directory.Packages.props | 3 +- src/LinkValidator.Tests/RelativeUriSpecs.cs | 46 +++++++++++++ src/LinkValidator/Actors/IndexerActor.cs | 16 ++++- src/LinkValidator/Actors/UriTypes.cs | 4 ++ src/LinkValidator/LinkValidator.csproj | 1 + src/LinkValidator/Program.cs | 72 ++++++++++++++++++++- src/LinkValidator/Util/DiffHelper.cs | 47 ++++++++++++++ src/LinkValidator/Util/MarkdownHelper.cs | 27 ++++++++ 8 files changed, 209 insertions(+), 7 deletions(-) create mode 100644 src/LinkValidator.Tests/RelativeUriSpecs.cs create mode 100644 src/LinkValidator/Util/DiffHelper.cs create mode 100644 src/LinkValidator/Util/MarkdownHelper.cs diff --git a/src/Directory.Packages.props b/src/Directory.Packages.props index b756fd0..311b398 100644 --- a/src/Directory.Packages.props +++ b/src/Directory.Packages.props @@ -5,6 +5,7 @@ + @@ -13,6 +14,6 @@ - + \ No newline at end of file diff --git a/src/LinkValidator.Tests/RelativeUriSpecs.cs b/src/LinkValidator.Tests/RelativeUriSpecs.cs new file mode 100644 index 0000000..0d5667d --- /dev/null +++ b/src/LinkValidator.Tests/RelativeUriSpecs.cs @@ -0,0 +1,46 @@ +using FluentAssertions; +using LinkValidator.Actors; + +namespace LinkValidator.Tests; + +public class RelativeUriSpecs +{ + public RelativeUri Uri1 { get; } = new(new Uri("/path", UriKind.Relative)); + + [Fact] + public void RelativeUri_should_throw_when_not_relative() + { + // Arrange + var uri = new Uri("http://example.com", UriKind.RelativeOrAbsolute); + + // Act + Action act = () => new RelativeUri(uri); + + // Assert + act.Should().Throw(); + } + + [Fact] + public void RelativeUri_should_equal_copy_of_itself() + { + // Arrange + var uri2 = new RelativeUri(new Uri(Uri1.Value.ToString(), UriKind.Relative)); + + // Assert + uri2.Should().Be(Uri1); + Uri1.GetHashCode().Should().Be(uri2.GetHashCode()); + } + + [Fact] + public void RelativeUri_should_print_path() + { + // Arrange + var uri = new RelativeUri(new Uri("/path-to-file.html", UriKind.Relative)); + + // Act + var result = uri.ToString(); + + // Assert + result.Should().Be("/path-to-file.html"); + } +} \ No newline at end of file diff --git a/src/LinkValidator/Actors/IndexerActor.cs b/src/LinkValidator/Actors/IndexerActor.cs index 24348aa..c9c3464 100644 --- a/src/LinkValidator/Actors/IndexerActor.cs +++ b/src/LinkValidator/Actors/IndexerActor.cs @@ -1,7 +1,9 @@ -using System.Net; +using System.Collections.Immutable; +using System.Net; using Akka.Actor; using Akka.Event; using Akka.Routing; +using LinkValidator.Util; namespace LinkValidator.Actors; @@ -51,10 +53,11 @@ private ReportStatistics() {} private readonly ILoggingAdapter _log = Context.GetLogger(); private readonly CrawlConfiguration _crawlConfiguration; private IActorRef _crawlers = ActorRefs.Nobody; - - public IndexerActor(CrawlConfiguration crawlConfiguration) + private readonly TaskCompletionSource> _completionSource; + public IndexerActor(CrawlConfiguration crawlConfiguration, TaskCompletionSource> completionSource) { _crawlConfiguration = crawlConfiguration; + _completionSource = completionSource; } public Dictionary IndexedDocuments { get; } = new(); @@ -83,6 +86,13 @@ protected override void OnReceive(object message) if (IsCrawlComplete) { _log.Info("Crawl complete!"); + + var finalOutput = IndexedDocuments + .Where(x => x.Value.status == CrawlStatus.Visited) + .ToImmutableSortedDictionary(x => UriHelpers.ToRelativeUri(_crawlConfiguration.BaseUrl, x.Key).ToString(), x => x.Value.Item2 ?? HttpStatusCode.NotFound); + + _completionSource.SetResult(finalOutput); + Context.Stop(Self); } break; diff --git a/src/LinkValidator/Actors/UriTypes.cs b/src/LinkValidator/Actors/UriTypes.cs index 5fbb918..521924d 100644 --- a/src/LinkValidator/Actors/UriTypes.cs +++ b/src/LinkValidator/Actors/UriTypes.cs @@ -1,3 +1,5 @@ +using System.Text; + namespace LinkValidator.Actors; public record struct AbsoluteUri @@ -25,4 +27,6 @@ public RelativeUri(Uri value) } public Uri Value { get; } + + public override string ToString() => Value.ToString(); } \ No newline at end of file diff --git a/src/LinkValidator/LinkValidator.csproj b/src/LinkValidator/LinkValidator.csproj index daa27bb..d74b746 100644 --- a/src/LinkValidator/LinkValidator.csproj +++ b/src/LinkValidator/LinkValidator.csproj @@ -9,6 +9,7 @@ + diff --git a/src/LinkValidator/Program.cs b/src/LinkValidator/Program.cs index c2036bd..a039d85 100644 --- a/src/LinkValidator/Program.cs +++ b/src/LinkValidator/Program.cs @@ -1,9 +1,75 @@ -namespace LinkValidator; +using System.Collections.Immutable; +using System.CommandLine; +using System.Net; +using Akka.Actor; +using LinkValidator.Actors; +using static LinkValidator.Util.DiffHelper; +using static LinkValidator.Util.MarkdownHelper; + +namespace LinkValidator; class Program { - static void Main(string[] args) + public static async Task Main(string[] args) + { + var urlOption = new Option("--url", "The URL to crawl") { IsRequired = true }; + var outputOption = new Option("--output", "Optional output file path for the sitemap"); + var diffOption = new Option("--diff", "Previous sitemap file to compare against"); + var strictOption = new Option("--strict", () => false, + "Return error code if pages are missing or returning 400+ status codes"); + + var rootCommand = new RootCommand("Website crawler and sitemap generator") + { + urlOption, + outputOption, + diffOption, + strictOption + }; + + rootCommand.SetHandler(async (url, output, diff, strict) => + { + using var system = ActorSystem.Create("CrawlerSystem", "akka.loglevel = INFO"); + var absoluteUri = new AbsoluteUri(new Uri(url)); + var results = await CrawlWebsite(system, absoluteUri); + var markdown = GenerateMarkdown(absoluteUri, results); + + _ = system.Terminate(); + + if (output != null) + { + await File.WriteAllTextAsync(output, markdown); + } + else + { + Console.WriteLine(markdown); + } + + if (!string.IsNullOrEmpty(diff)) + { + var previousMarkdown = await File.ReadAllTextAsync(diff); + var (differences, hasErrors) = CompareSitemapsWithErrors(previousMarkdown, markdown); + foreach (var difference in differences) + { + Console.WriteLine(difference); + } + + if (strict && hasErrors) + { + Environment.Exit(1); + } + } + }, urlOption, outputOption, diffOption, strictOption); + + return await rootCommand.InvokeAsync(args); + } + + private static async Task> CrawlWebsite(ActorSystem system, AbsoluteUri url) { - Console.WriteLine("Hello, World!"); + var crawlSettings = new CrawlConfiguration(url, 10, TimeSpan.FromSeconds(5)); + var tcs = new TaskCompletionSource>(); + + var indexer = system.ActorOf(Props.Create(() => new IndexerActor(crawlSettings, tcs)), "indexer"); + indexer.Tell(IndexerActor.BeginIndexing.Instance); + return await tcs.Task; } } \ No newline at end of file diff --git a/src/LinkValidator/Util/DiffHelper.cs b/src/LinkValidator/Util/DiffHelper.cs new file mode 100644 index 0000000..987f393 --- /dev/null +++ b/src/LinkValidator/Util/DiffHelper.cs @@ -0,0 +1,47 @@ +using System.Text.RegularExpressions; + +namespace LinkValidator.Util; + +public static partial class DiffHelper +{ + public static (IReadOnlyList Differences, bool HasErrors) CompareSitemapsWithErrors(string previous, string current) + { + var differences = new List(); + var hasErrors = false; + + var previousLines = previous.Split('\n') + .Skip(2) + .Where(l => !string.IsNullOrWhiteSpace(l)) + .ToList(); + + var currentLines = current.Split('\n') + .Skip(2) + .Where(l => !string.IsNullOrWhiteSpace(l)) + .ToList(); + + // Check for missing pages + foreach (var line in previousLines.Except(currentLines)) + { + differences.Add($"Missing: {line}"); + hasErrors = true; + } + + // Check for new pages + foreach (var line in currentLines.Except(previousLines)) + { + differences.Add($"New: {line}"); + + // Check if new page has error status code + var statusCodeMatch = MyRegex().Match(line); + if (statusCodeMatch.Success && int.Parse(statusCodeMatch.Groups[1].Value) >= 400) + { + hasErrors = true; + } + } + + return (differences, hasErrors); + } + + [GeneratedRegex(@"\|\s*(\d{3})\s*\|")] + private static partial Regex MyRegex(); +} \ No newline at end of file diff --git a/src/LinkValidator/Util/MarkdownHelper.cs b/src/LinkValidator/Util/MarkdownHelper.cs new file mode 100644 index 0000000..d4b4588 --- /dev/null +++ b/src/LinkValidator/Util/MarkdownHelper.cs @@ -0,0 +1,27 @@ +using System.Collections.Immutable; +using System.Net; +using Grynwald.MarkdownGenerator; +using LinkValidator.Actors; + +namespace LinkValidator.Util; + +public static class MarkdownHelper +{ + public static string GenerateMarkdown(AbsoluteUri baseUri, ImmutableSortedDictionary results) + { + var document = new MdDocument(); + + // Add a header + document.Root.Add(new MdHeading(1, $"Sitemap for [{baseUri.Value.ToString()}]")); + var headerRow = new MdTableRow(new MdTextSpan("URL"), new MdTextSpan("StatusCode")); + var rows = results.Select(kvp => new MdTableRow(new MdCodeSpan(kvp.Key), new MdTextSpan(kvp.Value.ToString()))); + + // Add a table + document.Root.Add(new MdTable(headerRow, rows)); + + return document.ToString(new MdSerializationOptions() + { + TableStyle = MdTableStyle.GFM + }); + } +} \ No newline at end of file