From ec5e38997202470a228a7d884939aa0430553861 Mon Sep 17 00:00:00 2001 From: ayan1c2 Date: Tue, 12 Nov 2024 15:26:09 +0000 Subject: [PATCH] deploy: a4acf78bdad5b34c32e236fa4e3efad564afc687 --- .nojekyll | 0 CLMS_doc_example.html | 1251 ++++++++++++++++++++++++++++++++ CLMS_filenamingconvention.html | 762 +++++++++++++++++++ CheatSheet.html | 800 ++++++++++++++++++++ README.html | 292 ++++++++ clms.html | 1227 +++++++++++++++++++++++++++++++ guidelines.html | 906 +++++++++++++++++++++++ sitemap.xml | 39 + 8 files changed, 5277 insertions(+) create mode 100644 .nojekyll create mode 100644 CLMS_doc_example.html create mode 100644 CLMS_filenamingconvention.html create mode 100644 CheatSheet.html create mode 100644 README.html create mode 100644 clms.html create mode 100644 guidelines.html create mode 100644 sitemap.xml diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/CLMS_doc_example.html b/CLMS_doc_example.html new file mode 100644 index 0000000..210918e --- /dev/null +++ b/CLMS_doc_example.html @@ -0,0 +1,1251 @@ + + + + + + + + + + + + +Developing CLMS Standards for Generative AI Training and Web Crawlers Using Quarto Markdown and Sitemaps + + + + + + + + + + + + + + + + + + + + +
+ +
+ +
+
+

Developing CLMS Standards for Generative AI Training and Web Crawlers Using Quarto Markdown and Sitemaps

+

Task 10.1: Information Provisioning for Generative Chatbots

+
+ + + +
+ +
+
Author
+
+

Ayan Chatterjee, Department of DIGITAL, NILU

+
+
+ +
+
Published
+
+

October 30, 2024

+
+
+ + +
+ + +
+
+
Keywords
+

CLMS standards, web crawlers, AI training, information formatting

+
+
+ +
+ + +
+
+

Abstract

+

Generative chatbots rely on large amounts of structured data to provide accurate, timely responses to user queries. By developing Copernicus Land Monitoring Service (CLMS) standards for information formatting and delivery using Quarto Markdown and sitemaps, we can ensure that the vast amounts of environmental data in CLMS are accessible to web crawlers and AI models. Using standardized structured content improves the discoverability and findability of CLMS products and makes it easier for users to access relevant datasets through traditional search engines and generative chatbots.

+

In addition, by providing clear guidelines for content formatting, cross-referencing, and sitemap management, this approach ensures that the CLMS data repository remains up-to-date and well-organized. This in turn supports the training of AI models to help users find exactly the CLMS products they need, whether through direct query or generative chatbot interaction.

+
+
+
+
+

1. Introduction

+
+

1.1. Importance of Copernicus Land Monitoring Service (CLMS)

+

The Copernicus Land Monitoring Service (CLMS) is a critical component of the Copernicus Programme, which is the European Union’s Earth observation initiative [1]. The service is responsible for providing timely and accurate land cover and land use data, along with a wide range of environmental variables related to land ecosystems. This data is essential for understanding and managing Europe’s environmental resources, supporting sustainable development, climate monitoring, and informed policy-making. The key areas where CLMS is vital include:

+
    +
  • Environmental Monitoring: CLMS provides data on land cover, vegetation, soil, and water bodies, which are crucial for monitoring environmental changes such as deforestation, urban sprawl, and the health of ecosystems. This data supports conservation efforts and helps in tracking biodiversity and land degradation.

  • +
  • Sustainable Land Management: With the growing need for sustainable practices, CLMS delivers data that helps governments and organizations plan and manage land resources more effectively. It supports agriculture, forestry, water management, and urban planning, helping to mitigate the effects of climate change.

  • +
  • Climate Change Monitoring: CLMS plays a significant role in assessing the impact of climate change on European landscapes. It helps track changes in land use, vegetation, and land surface temperatures, which are important indicators of climate change impacts.

  • +
  • Disaster Management: CLMS data is used for emergency response and disaster management, especially in cases of floods, fires, and other natural disasters. The accurate and near-real-time data allows authorities to take preventive actions and make quick decisions during emergencies.

  • +
  • Policy Support and Decision-Making: The service supports EU environmental policies, including the Green Deal, Common Agricultural Policy (CAP), and the EU Biodiversity Strategy. The data provided by CLMS informs decision-makers at the European, national, and local levels, ensuring that policies are grounded in the latest environmental data.

  • +
+
+
+

1.2. Importance of CLMS Documentation for Web Crawlers: Enhancing Product Discoverability and Findability

+

The discoverability and findability of CLMS products on the web are crucial for ensuring that this valuable environmental data is accessible to a wide range of users, including researchers, policymakers, and environmental organizations. Making CLMS documentation available on the web for crawlers facilitates product discoverability by enabling search engines and AI-powered systems (like generative chatbots) to index, retrieve, and present relevant data to users. Here’s why ensuring that CLMS documents are available to web crawlers is essential:

+
    +
  • Increased Accessibility for Diverse Users: CLMS products cater to a broad audience, including government agencies, NGOs, scientists, and the public. Properly formatted and exposed documentation allows these users to easily find and access data via search engines. Web crawlers can efficiently index CLMS products, simplifying the search for specific datasets without navigating complex databases.

  • +
  • Enhanced Search Engine Optimization (SEO): Applying SEO best practices, such as descriptive titles, rich metadata, and semantic structure, helps CLMS pages rank higher in search results, so users can reach specific datasets through ordinary search queries rather than navigating complex databases.

  • +
  • Improved Product Findability Through AI and Chatbots: AI-powered search tools and chatbots use indexed information to generate responses. By ensuring that CLMS documentation is structured for crawling, CLMS products become accessible to third-party chatbots, expanding their reach through natural language queries and conversational interfaces.

  • +
  • Faster and More Accurate Data Retrieval: Well-formatted CLMS documents enable faster and more accurate data retrieval, essential for time-sensitive applications like disaster management. Proper crawling ensures that search engines and AI systems provide up-to-date CLMS products, crucial for timely decision-making.

  • +
  • Standardization and Interoperability: Adopting CLMS standards and formats like Quarto Markdown ensures consistency, making documents easier to index and retrieve. Standardization promotes interoperability, allowing CLMS data to be used across various platforms, including AI systems and environmental tools.

  • +
  • Global Reach and Broader Impact: Making CLMS documents available to web crawlers increases their global reach. Optimized data allows users worldwide to access key environmental information, contributing to global initiatives, research, and policymaking beyond the EU.

  • +
  • Supporting Third-Party Integration: Third-party platforms rely on web crawlers and AI tools to access CLMS data. By exposing CLMS products to crawlers, the data can be integrated into various tools and services, enhancing discoverability and promoting broader use in AI-driven analytics and public services.

  • +
+

By making CLMS documents available to web crawlers using standardized formats such as HTML, PDF, and DOCX (which adhere to semantic structure, web standards, and use metadata), CLMS can ensure that its products are easily indexed, retrieved, and integrated into a variety of search engines, artificial intelligence systems, and chatbots. This strategy not only increases the visibility of CLMS products, but also improves accessibility to a global audience, ensuring that researchers, policymakers, and the public can effectively find and use CLMS data. At a time when timely, accurate environmental data is becoming increasingly important, optimizing CLMS products for web crawlers is a necessary step to ensure that everyone has access to these valuable resources.

+
+
+

1.3. Web crawling and Information Provisioning for Generative Chatbots

+

Web crawling is the process used by search engines to explore and index the web pages of websites. The crawler downloads pages, reads the content, and adds it to the search engine’s index. Crawlers are designed to navigate from one page to another by following hyperlinks, allowing them to efficiently cover a website’s entire structure. Search engines rely on crawlers to keep their results up-to-date by regularly visiting websites and checking for new or modified content. Googlebot, Bingbot, and Yahoo Slurp are some examples of popular web crawlers. Key terms involved in web crawling are listed below; a toy crawler sketch follows the list:

+
    +
  • Search engine: A system that allows users to search for content on the web.
  • +
  • Indexing: The process of storing web content so it can be retrieved later.
  • +
  • Web pages: Documents that make up the web, interconnected by hyperlinks.
  • +
  • Hyperlinks: Links that connect different web pages, forming a navigable web.
  • +
+
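To make the crawling loop concrete, here is a toy breadth-first crawler sketch in Python, using only the standard library. The seed URL, page limit, and in-memory index are illustrative assumptions; production crawlers additionally honor robots.txt, rate limits, and politeness policies.

```python
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urljoin
from urllib.request import urlopen

class LinkParser(HTMLParser):
    """Collect the href targets of anchor tags on a page."""
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.links.append(value)

def crawl(seed, max_pages=10):
    """Breadth-first crawl: download a page, store it, follow its links."""
    seen, queue, index = set(), deque([seed]), {}
    while queue and len(index) < max_pages:
        url = queue.popleft()
        if url in seen:
            continue
        seen.add(url)
        try:
            html = urlopen(url, timeout=10).read().decode("utf-8", "replace")
        except (OSError, ValueError):
            continue  # unreachable or malformed URL: skip it
        index[url] = html  # a stand-in for the search engine's index
        parser = LinkParser()
        parser.feed(html)
        for link in parser.links:
            queue.append(urljoin(url, link))  # resolve relative links
    return index
```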

Web crawling has become essential for search engines and AI applications. The integration of these technologies has been explored extensively [2], [3], [4], [5]. The growth of digital content has placed significant demands on the efficiency and accuracy of web crawlers and artificial intelligence (AI) models [6], [7]. In response, Copernicus Land Monitoring Service (CLMS) standards are essential for establishing uniformity in the way data is formatted, structured, and exposed to automated tools like crawlers and AI training pipelines. These standards help ensure that content is easy to access, interpret, and process, leading to more accurate information retrieval and AI model training. This document outlines the development of CLMS standards for exposing information to web crawlers and optimizing its formatting for AI data ingestion. Figure 1 illustrates how a web crawler works [8].

+
+
+
+ +
+
+Figure 1: Diagram illustrating web crawling [8]. +
+
+
+

In recent years, generative chatbots have made great progress and become powerful tools that allow users to access detailed information and conduct complex queries. In particular, chatbots can help users explore certain aspects of CLMS products, such as allocation rules or the purpose of a particular product. These tools are not only critical for product discoverability, but also improve user understanding of CLMS products. To ensure that chatbots effectively help users find and understand CLMS products, it is important that the underlying information is formatted and presented in a way that is easy to find and use. This requires well-structured documentation and a system that allows web crawlers and AI models to effectively access and process CLMS data.

+

Web crawlers and AI models are critical to the discoverability of online information. Web crawlers that index websites rely on well-structured content to perform their tasks effectively. Similarly, generative AI models, including chatbots, require high-quality structured data to produce accurate and meaningful results. CLMS provides important environmental data, but in order for this data to be useful to AI models and easy for users to find, it must be properly formatted and made available.

+
+

1.3.1. Motivation

+

The relationship between AI and web crawlers has led to new frontiers in both industries. The primary motivation for creating CLMS standards lies in the need for:

+
    +
  • Improved Crawling Efficiency: Properly formatted content with metadata helps crawlers index relevant information faster and more accurately.

  • +
  • Better AI Model Training: Consistent content structure ensures that AI models are trained on high-quality, organized data.

  • +
  • Data Accessibility: Standardizing the structure of content ensures that information is universally accessible across platforms.

  • +
+

The following key aspects are critical for ensuring that data is structured and accessible for web crawlers and AI systems:

+
    +
  • Uniform metadata: Consistent metadata usage across all content is essential. Metadata includes details like title, author, keywords, and publication date. Uniform metadata ensures that web crawlers and AI systems can easily index and categorize content, improving searchability and discoverability.

  • +
  • Clearly defined content sections: Content should be organized into distinct sections, such as titles, headings, and subheadings. This structured format helps both users and machines navigate through the content efficiently, making key information easy to locate and retrieve.

  • +
  • Embedded structured data formats: Incorporating structured data formats such as JSON-LD, RDF, or XML provides a precise way of representing information. These formats help web crawlers and AI systems understand relationships and attributes within the content, facilitating accurate extraction, interpretation, and use of the data across various platforms.

  • +
+
+
+

1.3.2. Importance

+
    +
  • Enhanced Web Crawling: Properly structured CLMS content will improve web crawlers’ ability to index and retrieve information.

  • +
  • Improved AI Training: Structured data ensures higher-quality datasets, which result in better-trained AI models, particularly for generative chatbots.

  • +
  • Better User Experience: By improving product discoverability and findability, users will have an easier time accessing and understanding CLMS products.

  • +
+
+
+
+ +
+
+Tip +
+
+
+

Given the growing complexity of CLMS products and the increasing reliance on generative AI tools, it is critical to implement standards that improve the discoverability and usability of CLMS data.

+
+
+
+
+
+ +
+
+Note +
+
+
+

By standardizing the format and delivery of CLMS information, our goal is to ensure that generative AI applications, such as web crawlers and chatbots, can effectively access and use this data.

+
+
+
+
+
+
+

2. Content Standards

+

Developing content standards requires collaboration between content creators, data engineers, and AI researchers. The process typically follows these stages for different document types in use:

+
+

2.1. Content Structuring

+

Content structuring involves organizing data into recognizable, standard components, such as:

+
    +
  • Title: Main identifier of the content.

  • +
  • Metadata: Information about the content, including authors, dates, keywords, and relevant classification.

  • +
  • Headings and Subheadings: Structured sections that break down the content into digestible parts.

  • +
+

An example of metadata formatting is given below:

+
---
+title: "Developing CLMS Standards for Generative AI Training and Web Crawlers"
+subtitle: "Task 10.1: Information Provisioning for Generative Chatbots"
+author: "Ayan Chatterjee, Department of DIGITAL, NILU, ayan@nilu.no."
+date: "2024-09-10"
+sitemap: true           #Enables sitemap generation for web crawlers
+toc: true              # Enable the Table of Contents
+toc-title: "Index"      # Customize the title of the table of contents
+toc-depth: 3            # Include headings up to level 3 (###)
+keywords: ["CLMS standards", "web crawlers", "AI training", "information formatting"]
+bibliography: references.bib   # Link to the bibliography file
+csl: ieee.csl                  # Link to the CSL file for IEEE style
+format: 
+  html: default
+  pdf: default
+  docx: default
+---
+
+
+

2.2. HTML Structuring

+

The following structured approach in HTML allows web crawlers to effectively index and retrieve content while facilitating AI training for generative models, ensuring that information is both accessible and usable:

+
+

2.2.1. Semantic Structuring and Formatting

+

To enhance both machine readability and user comprehension, we must follow structured and semantic formatting principles. This includes using HTML5 elements, schema markup, and providing clear metadata. Using HTML5 semantic elements like <article>, <section>, <header>, and <footer> helps structure the document meaningfully. For example:

+
<article>
+  <header>
+    <h1>Understanding Web Crawlers</h1>
+    <meta name="description" content="How web crawlers work and index ..!" />
+  </header>
+  <section>
+    <h2>How Crawlers Index Content</h2>
+    <p>Web crawlers use semantic structure to efficiently index web pages.</p>
+  </section>
+  <footer>
+    <p>Author: Ayan Chatterjee</p>
+  </footer>
+</article>
+
+
+

2.2.2. Microdata for Enhancing Machine Readability

+

Microdata attributes such as itemscope, itemtype, and itemprop provide semantic clarity for machines, enabling more efficient crawling and interpretation.

+
<article itemscope itemtype="https://schema.org/Article">
+  <header>
+    <h1 itemprop="headline">Web Crawling Explained</h1>
+    <meta itemprop="description" content="How web crawlers index ..?" />
+  </header>
+</article>
+
+
+

2.2.3. Schema Markup for Structured Content

+

Use Schema Markup (like ResearchArticle, Dataset, or CreativeWork) to define the content type and enhance machine readability. This helps both web crawlers and AI to categorize content accurately.

+
<article itemscope itemtype="https://schema.org/ResearchArticle">
+  <header>
+    <h1 itemprop="headline">AI Training for Web Crawlers</h1>
+    <meta itemprop="description" content=" AI training techniques for .." />
+  </header>
+</article>
+
+
+

2.2.4. Headings and Subheadings

+

Provide clearly defined headings and subheadings to organize content for easier navigation and indexing by crawlers.

+
---
+# How AI Models are Trained
+## Data Collection
+## Model Training
+## Evaluation
+---
+
+
+

2.2.5. Alt Text and Descriptions

+

For images and diagrams, always provide alt text and descriptions to improve accessibility.

+
![A diagram illustrating how web crawlers work](images/web_crawlers.png){alt="A diagram of web crawler processes" width=50%}
+
+
+

2.2.6. Meta Tags and Descriptions

+

Add meta tags and descriptions to help web crawlers index the content more accurately.

+
<meta name="description" content="How web crawlers work effectively!" />
+
+
+

2.2.7. Phrasing and Content Presentation

+

Ensure that important keywords are present in titles, headings, and throughout the content without overusing them (avoid keyword stuffing).

+
# Introduction to Web Crawlers and AI Training
+Web crawlers, also known as spiders, are used by search engines to index web ...
+

Write in a clear and concise manner. Avoid jargon unless necessary, and ensure that key concepts are easy to understand.

+
Web crawlers automatically scan websites to collect and index content. 
+They follow links, downloading web pages and saving them for future queries.
+

Use hyperlinks and cross-references to guide both users and web crawlers to related content.

+
For more details, see the [Introduction to AI Training](#data-collection).
+

Provide a brief abstract or summary at the beginning of each article or section for better clarity and indexing.

+
**Summary:** This article provides an overview of indexing content and its integration with AI.
+
+
+

2.2.8. Structured Data Repositories

+

To enable knowledge transfer to generative AI, use standardized formats like JSON-LD, RDF, or XML to define metadata and structure.

+
{
+  "@context": "https://schema.org",
+  "@type": "Dataset",
+  "name": "AI Training Dataset",
+  "description": "A dataset designed to improve search engine crawlers."
+}
+
<dataset xmlns="http://www.w3.org/2001/XMLSchema-instance" type="AI Training Dataset">
+  <name>AI Training Dataset</name>
+  <description>A dataset designed for training AI models.</description>
+</dataset>
+
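As a quick sanity check before publishing, the JSON-LD payload can be parsed with any standard JSON library. A minimal Python sketch, reusing the example record above:

```python
import json

# The JSON-LD record above, e.g. read from the page's <script> tag.
json_ld = """{
  "@context": "https://schema.org",
  "@type": "Dataset",
  "name": "AI Training Dataset",
  "description": "A dataset designed to improve search engine crawlers."
}"""

record = json.loads(json_ld)          # raises ValueError if malformed
assert record["@type"] == "Dataset"   # crawlers key off @type to categorize
print(record["name"])                 # -> AI Training Dataset
```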
+
+
+

2.3. PDF Structuring

+

The following structured approach prepares PDF documents for indexing by web crawlers, integration with AI systems, and improved accessibility for users:

+
+

2.3.1. Accessible PDF Formats by Tagging

+

Ensure that the PDF is tagged properly so that screen readers and AI tools can interpret the document structure. For instance, headings, paragraphs, and lists should be tagged semantically.

+
# Heading 1 (tagged as <h1>)
+- List item 1 (tagged as <ul><li>)
+
+
+

2.3.2. Structuring and Formatting

+

The document structure should be accessible, with a clear hierarchy and a clickable table of contents (TOC). Accessible tagging, hierarchical organization, and the use of real text rather than images of text improve usability for both humans and machines.

+

Organize content into a well-defined hierarchy using headings (#, ##, ###). This improves both user navigation and machine parsing for AI and web crawlers.

+
## Section 1: Introduction
+### Subsection 1.1: Overview
+
+toc: true
+toc-depth: 2
+
+
+

2.3.3. Adding Metadata

+

Embedding metadata such as document properties (e.g., Title, Author, Subject, and Keywords), XMP metadata, Schema.org metadata, and descriptive metadata helps search engines and AI systems index, categorize, and retrieve information efficiently.

+
title: "PDF Structuring and Formatting"
+author: "Ayan Chatterjee"
+subject: "Document Accessibility and Metadata"
+keywords: ["PDF accessibility", "metadata", "AI integration"]
+

XMP metadata is stored as XML in the PDF file, allowing for rich data descriptions. Schema.org metadata in JSON-LD provides structured information that AI and web crawlers can easily understand.

+
{
+  "@context": "https://schema.org",
+  "@type": "CreativeWork",
+  "name": "PDF Structuring and Formatting",
+  "author": {
+    "@type": "Person",
+    "name": "Jane Doe"
+  },
+  "keywords": ["PDF accessibility", "metadata", "AI integration"]
+}
+
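Document properties can also be written programmatically. Below is a minimal sketch using the third-party pypdf library; the input and output file names are hypothetical:

```python
from pypdf import PdfReader, PdfWriter

reader = PdfReader("input.pdf")   # hypothetical source file
writer = PdfWriter()
for page in reader.pages:
    writer.add_page(page)

# Mirror the document properties shown above into the PDF's info dictionary.
writer.add_metadata({
    "/Title": "PDF Structuring and Formatting",
    "/Author": "Ayan Chatterjee",
    "/Subject": "Document Accessibility and Metadata",
    "/Keywords": "PDF accessibility, metadata, AI integration",
})
with open("output.pdf", "wb") as f:
    writer.write(f)
```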
+
+

2.3.4. Optimizing Content Presentation

+

Ensuring the proper placement of keywords, providing alt text for images, and correctly labeling figures and tables contribute to the searchability and accessibility of the content. This is crucial for effective interaction with web crawlers and AI models.

+
Keywords: PDF accessibility, web crawlers, generative AI
+![A flowchart showing the PDF processing workflow](path/to/image.png){alt="PDF workflow"}
+![Figure 1: A table of contents structure](path/to/image.png){#fig-toc}
+
+
+

2.3.5. Setting Up for Knowledge Transfer to Generative AI

+

Using machine-readable fonts (e.g., Arial, Times New Roman), a clean and simple layout, and adding comments or annotations helps prepare the document for use in generative AI systems. AI models benefit from well-structured and easy-to-parse content, which improves their ability to understand and generate meaningful responses based on the content.

+
## Section 1: Overview
+This section introduces the importance of accessible PDFs for AI processing...
+
+<!-- This annotation explains the role of hierarchical metadata for AI -->
+
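A quick way to verify that a PDF is machine-readable is to try extracting its text. A sketch, again with the third-party pypdf library and a hypothetical file name:

```python
from pypdf import PdfReader

reader = PdfReader("report.pdf")  # hypothetical file
text = "\n".join(page.extract_text() or "" for page in reader.pages)
# Little or no extracted text usually means scanned images, which
# crawlers and AI systems cannot parse without OCR.
print(f"Extracted {len(text)} characters")
```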
+
+
+ +
+
+Important +
+
+
+

By following such structured practices, we can ensure that the content is both human-readable and machine-readable, facilitating easy discovery by web crawlers and seamless integration with AI training systems.

+
+
+
+
+
+
+

3. Developing CLMS Standards

+

In the context of Developing CLMS Standards, it is essential to utilize advanced tools that support both the creation of well-structured documents and the easy discoverability of content for web crawlers and AI systems. Several tools are available for content formatting, documentation, and publication. Among these, Quarto stands out due to its versatility, allowing users to create, format, and publish documents in multiple formats (HTML, PDF, Word) with integrated support for code execution and structured content.

+

This section compares several of these tools, explaining why Quarto is particularly suitable for creating CLMS-compliant documentation. We’ll also cover how to configure Quarto with RStudio and the importance of using Quarto Markdown for CLMS content. A Quarto Markdown file provides a structured approach to documenting the development of CLMS standards, ensuring content is easily accessible by both web crawlers and AI systems.

+
+

3.1. Tools for CLMS Documentation

+
    +
  • Quarto: Quarto is a highly versatile tool for creating and publishing documents, including PDFs, with rich formatting, code integration, and support for multiple formats (HTML, PDF, Word). Quarto’s cross-platform capabilities make it ideal for creating structured and searchable documents for CLMS, supporting web crawlers and AI applications.

  • +
  • R Markdown: A popular tool in the R community that allows users to combine narrative text with R code, producing output in HTML, PDF, and Word formats. Though powerful for statistical analysis, it is more limited in non-R-based workflows compared to Quarto.

  • +
  • Jupyter Notebooks: An interactive tool supporting over 40 programming languages, commonly used for data science and computing. Notebooks can be exported to multiple formats (HTML, PDF, slides), but lack Quarto’s advanced content formatting features.

  • +
  • Pandoc: A universal document converter that enables conversion between various markup formats, including Markdown, LaTeX, and HTML. While powerful for conversions, Pandoc lacks the code integration and dynamic formatting of Quarto.

  • +
  • LaTeX: A document preparation system for producing scientific and technical documents. While highly customizable, it requires significant expertise and lacks the ease of Markdown tools like Quarto.

  • +
  • Hugo: A static site generator used for creating websites and blogs from Markdown files. While efficient for websites, it doesn’t provide the same level of document control and integration as Quarto.

  • +
  • Sphinx: A documentation generator mainly used for Python projects. It supports conversion to formats like HTML and PDF but lacks the cross-language support and document versatility of Quarto.

  • +
  • Bookdown: An extension of R Markdown, designed for writing books and long documents. It supports multiple output formats but is mostly R-focused, while Quarto supports multiple languages.

  • +
  • GitBook: A tool for creating documentation and books using Markdown. It allows collaboration but lacks the dynamic formatting and multi-language support found in Quarto.

  • +
  • Pelican: A static site generator that uses Markdown or reStructuredText. Best suited for blogs, it doesn’t provide the integrated support for complex documents required by CLMS standards.

  • +
  • Typora: A WYSIWYG Markdown editor that offers easy editing but lacks the advanced document control and integration capabilities that Quarto provides.

  • +
+

A comparison of tools for CLMS documentation is shown in Table 1 below. As shown in Table 1, Quarto outperforms the other tools in terms of supported output formats and reproducibility.

+
+
+
+Table 1: Comparative analysis of Quarto versus other formatting tools. +
+
+ ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Tool | Cross-Language Support | Output Formats | Code Integration | Static Site Generation | Ideal Use Case |
|------|------------------------|----------------|------------------|------------------------|----------------|
| Quarto | Yes | HTML, PDF, Word | Yes | Yes | Reports, blogs, CLMS docs |
| R Markdown | R only | HTML, PDF, Word | Yes (R) | No | Statistical reports |
| Jupyter Notebooks | 40+ languages | HTML, PDF | Yes | No | Data Science |
| LaTeX | Limited | PDF, HTML | No | No | Scientific papers |
| Hugo | No | HTML | No | Yes | Blogs, websites |
| Sphinx | Python | HTML, PDF | No | Yes | Python documentation |
+
+
+
+
+
+

3.2. Quarto Markdown

+

Markdown is a lightweight, easy-to-read syntax used for formatting plain text documents [9], [10], [11]. In Quarto, Markdown is extended to support additional features beyond standard Markdown, allowing users to write text, integrate code, and generate richly formatted documents in various formats such as HTML, PDF, and Word [9], [10], [11]. Quarto Markdown combines the simplicity of regular Markdown with powerful features for document rendering, making it ideal for data analysis, technical writing, academic papers, and reports [9], [10], [11].

+

Quarto Markdown uses the standard Markdown syntax for headings, lists, emphasis, and links, while also supporting enhanced features like cross-referencing, citations, figures, tables, mathematical equations, and more [9], [10], [11]. Quarto also allows for code execution in multiple programming languages (such as Python, R, and Julia) embedded within the Markdown file, enabling dynamic document creation where the outputs are generated directly from the code [9], [10], [11], [12].

+

Key features of Markdown in Quarto are:

+
    +
  • Standard Markdown: Supports headings, lists, links, images, bold, italics, etc.
  • +
  • YAML Header: Allows users to specify metadata like title, author, date, and output formats (HTML, PDF, Word) at the start of the document.
  • +
  • Cross-references: Provides automatic numbering and referencing for figures, tables, sections, etc.
  • +
  • Code Execution: Integrates code cells for multiple programming languages, making it possible to run code and include its outputs directly in the document.
  • +
  • Mathematics and Equations: Supports LaTeX-style equations for technical writing.
  • +
  • Citations: Allows for referencing research papers and articles using BibTeX or CSL styles.
  • +
  • Multi-output Format: Enables seamless conversion to multiple formats like HTML, PDF, Word, presentations, and slides.
  • +
+
+

3.2.1. Significance

+

Markdown in Quarto is significant for CLMS documentation due to its simplicity and flexibility. With an easy-to-use syntax, it allows users to format text without complex tools, making it accessible to both non-technical users and programmers, and it supports a wide variety of documents, from blog posts to scientific reports. Quarto extends standard Markdown with rich formatting options essential for technical and academic writing, including built-in support for tables, figures, equations, footnotes, and cross-referencing.

The integration of code and text is another powerful feature: Quarto Markdown can embed code execution within documents, which is critical for reproducible research because tables, charts, and figures can be generated directly from code. This makes it highly suitable for data science and technical reporting. Quarto Markdown also supports multi-format output, allowing users to create content once and export it to multiple formats like HTML, PDF, and Word, streamlining document preparation for different audiences. When used for online content, its structured format improves SEO (Search Engine Optimization), making it easier for search engines to index content and enhancing discoverability.

The ease of managing references, citations, and cross-references further strengthens its utility in academic and research documentation. Since Markdown files are plain text, Quarto integrates seamlessly with version control tools like Git, enabling easy collaboration among multiple contributors, especially in open-source and research communities. Finally, Quarto Markdown’s versatility extends across blogs, technical documentation, reports, scientific papers, and books, making it an ideal tool for content creators across disciplines.

+
+
+

3.2.2. Configuring Quarto with RStudio

+

To integrate Quarto with RStudio:

+

Prerequisites:

+
    +
  1. Install RStudio: Download and install RStudio from RStudio Download.
  2. +
  3. Install Quarto: Follow Quarto installation to install the Quarto CLI.
  4. +
+

Basic Setup in RStudio:

+
    +
  1. Create a New Quarto Document: +
      +
    • In RStudio, go to File > New File > Quarto Document.
    • +
    • Choose the type of document (e.g., HTML) and enter your title and metadata in the YAML header.
    • +
  2. +
  3. Save the File: +
      +
    • Save the file with a .qmd extension to ensure it is treated as a Quarto Markdown file.
    • +
  4. +
  5. YAML Header Configuration: +
      +
    • Configure the YAML header with essential metadata to optimize the document for web crawling.
    • +
  6. +
+

Rendering:

+
 You can directly write your content in RStudio and then render the *.qmd using Quarto to multiple formats:
```bash
quarto render your-notebook.qmd --to html
quarto render your-notebook.qmd --to pdf
quarto render your-notebook.qmd --to docx
```
+

YAML Header in R Studio:

+
```yaml
+---
+title: "CLMS Data Analysis"
+author: "Ayan Chatterjee"
+format:
+  html: default
+  pdf: default
+  docx: default    
+---
+```
+
+
+
+

3.3. Indexing

+

Proper indexing is essential for increasing the discoverability and accessibility of CLMS products [13], [14]. By formatting documents using Quarto Markdown and generating a sitemap.xml, we can ensure that search engines and AI systems efficiently crawl and retrieve CLMS content [13], [14]. To improve document indexing for enhanced discoverability and accessibility, we can adopt the following approaches:

+
    +
  • Organize content using structured headers and metadata in Quarto Markdown.
  • +
  • Use proper keywords and descriptions in the document metadata.
  • +
  • Cross-reference related documents to create interconnected content that helps crawlers navigate.
  • +
+
---
+title: "Land Use Mapping with CLMS Data"
+author: "Ayan Chatterjee"
+date: "2024-08-01"
+keywords: ["land use", "CLMS", "mapping", "environment"]
+description: "A detailed report on how CLMS data."
+---
+
+

3.3.1. Sitemap Generation

+

A sitemap.xml helps web crawlers discover all the content on the website [13], [14]. By providing a clear roadmap, crawlers can index each document, ensuring that all CLMS resources are available for search and AI training. By using Quarto Markdown and generating a sitemap.xml, CLMS documents can be structured in a way that improves their indexing, making them more discoverable by search engines and AI systems. This approach ensures efficient crawling, improves search engine ranking, and enhances the accessibility of CLMS products for users and AI models alike.

+
    +
  • Search Engine Discoverability: Users and AI systems can easily find the indexed CLMS documents.
  • +
  • Efficient Crawling: The sitemap provides a roadmap, allowing for faster and more accurate indexing.
  • +
  • Increased Accessibility: Properly indexed documents are easier for users and AI to retrieve and utilize, improving the overall product visibility.
  • +
+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+   <url>
+      <loc>http://example.com/clms/land-use-mapping</loc>
+      <lastmod>2024-08-01</lastmod>
+      <changefreq>monthly</changefreq>
+   </url>
+   <url>
+      <loc>http://example.com/clms/land-cover-change</loc>
+      <lastmod>2024-07-15</lastmod>
+      <changefreq>monthly</changefreq>
+   </url>
+</urlset>
+
+
+

3.3.2. Steps to Implement and Submit the Sitemap

+
    +
  • Generate the Sitemap: Use a sitemap generator tool (e.g., XML-Sitemaps or Screaming Frog) to create a sitemap, or have it generated automatically by a CMS like WordPress or a static site generator like Hugo. A scripted alternative is sketched after this list.

  • +
  • Upload the Sitemap: Once generated, place the sitemap.xml file in the root directory of your website, e.g., https://www.example.com/sitemap.xml.

  • +
  • Submit to Search Engines: Submit your sitemap to search engines via tools like Google Search Console and Bing Webmaster Tools. This helps search engines index your site properly.

  • +
+
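For small static sites, a sitemap can also be generated with a short script rather than a dedicated tool. A minimal Python sketch using only the standard library; the URLs are taken from the example above, and the monthly change frequency is an assumption:

```python
from xml.etree import ElementTree as ET

def build_sitemap(pages, outfile="sitemap.xml"):
    """Write a minimal sitemap.xml from (url, lastmod) pairs."""
    urlset = ET.Element("urlset",
                        xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")
    for url, lastmod in pages:
        entry = ET.SubElement(urlset, "url")
        ET.SubElement(entry, "loc").text = url
        ET.SubElement(entry, "lastmod").text = lastmod
        ET.SubElement(entry, "changefreq").text = "monthly"  # assumed cadence
    ET.ElementTree(urlset).write(outfile, encoding="utf-8",
                                 xml_declaration=True)

build_sitemap([
    ("http://example.com/clms/land-use-mapping", "2024-08-01"),
    ("http://example.com/clms/land-cover-change", "2024-07-15"),
])
```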
+
+

3.3.3. Enhancing Indexing for Web Crawlers and AI Models

+

To ensure that CLMS documents are findable and accessible to web crawlers and AI models, it’s important to implement proper steps for generating and submitting a sitemap and using structured data (such as metadata and JSON-LD) to enhance indexing.

+
    +
  • Descriptive Filenames: Use filenames that clearly describe the content of the document. For instance, instead of doc1.md, use clms-land-monitoring-data.md.

  • +
  • Metadata: Add descriptive metadata in your Quarto Markdown files (e.g., title, author, keywords). This helps search engines and AI models understand the content better.

  • +
  • Text Content: Ensure that text content is descriptive and structured using headings and subheadings to guide crawlers.

  • +
  • HTML Metadata and JSON-LD Structured Data: Use HTML metadata and JSON-LD structured data within the Quarto document to improve how your content is indexed by search engines and used by AI training systems.

  • +
+

The following Quarto Markdown YAML header example demonstrates how to enhance document visibility for web crawling and AI training by including metadata and structured data. This can be part of your CLMS documentation to ensure that it is well-indexed and easy to discover.

+
---
+title: "CLMS Land Monitoring Data"
+author: "Ayan Chatterjee"
+date: "2024-09-15"
+keywords: ["CLMS", "web crawling", "AI training", "environmental data"]
+description: "Comprehensive overview of CLMS land monitoring datasets, ......"
+sitemap: true  # Flag to include this document in the sitemap
+
+# HTML metadata for SEO and discoverability
+meta:
+  - name: "description"
+    content: "CLMS land monitoring datasets for environmental and climate ..."
+  - name: "keywords"
+    content: "CLMS, land monitoring, environmental data, AI, web crawling"
+
+# JSON-LD structured data to help search engines and AI understand the content
+json-ld:
+  - "@context": "https://schema.org"
+    "@type": "Dataset"
+    "name": "CLMS Land Monitoring Data"
+    "description": "Detailed data on land monitoring and ...."
+    "url": "https://www.example.com/clms-land-monitoring-data"
+    "keywords": "land monitoring, environmental data, AI training.."
+    "datePublished": "2024-09-15"
+    "creator":
+      "@type": "Organization"
+      "name": "Copernicus Land Monitoring Service"
+    "publisher":
+      "@type": "Organization"
+      "name": "European Environment Agency"
+---
+
+
+
+ +
+
+Important +
+
+
+

Quarto stands out as the most versatile tool for creating CLMS-compliant documents, with cross-language support, integration of code, multiple output formats, and the ability to generate static websites.

+
+
+
+
+
+ +
+
+Important +
+
+
+

To ensure that CLMS documents are findable and accessible to web crawlers and AI models, it’s important to implement proper steps for generating and submitting a sitemap and using structured data (such as metadata and JSON-LD) to enhance indexing.

+
+
+
+
+
+ +
+

4. Conclusion

+

The European Environment Agency (EEA) recognizes the growing need for generative chatbots and natural language analysis tools to facilitate easy access to CLMS data. In response, the EEA is undertaking preparatory efforts to establish the necessary standards and infrastructure for successful chatbot integration. These activities focus on ensuring that CLMS products are findable and discoverable, enabling users, regardless of technical expertise, to access environmental data seamlessly.

+

A key part of this strategy is making CLMS documentation and data accessible to third-party generative AI platforms. By implementing standards for formatting and exposing information—particularly through Quarto Markdown and sitemaps—CLMS ensures that high-quality, structured data is available to chatbots and AI systems. This not only enhances product discoverability but also improves user experience, allowing chatbots to guide users through complex datasets and environmental resources.

+

The collaboration between CLMS and the EEA lays the groundwork for a future where AI systems can efficiently retrieve and process environmental data, supporting informed decision-making and increasing public engagement with CLMS products.

+
+
+

5. References

+
+
+ + +

References

+
+
[1]
E. Project, “CLMS - Copernicus Land Monitoring Service.” 2024. Available: https://land.copernicus.eu/en
+
+
+
[2]
M. A. Khder, “Web scraping or web crawling: State of art, techniques, approaches and application.” International Journal of Advances in Soft Computing & Its Applications, vol. 13, no. 3, 2021.
+
+
+
[3]
B. Massimino, “Accessing online data: Web-crawling and information-scraping techniques to automate the assembly of research data,” Journal of Business Logistics, vol. 37, no. 1, pp. 34–42, 2016.
+
+
+
[4]
M. A. Kausar, V. Dhaka, and S. K. Singh, “Web crawler: A review,” International Journal of Computer Applications, vol. 63, no. 2, pp. 31–36, 2013.
+
+
+
[5]
C. Saini and V. Arora, “Information retrieval in web crawling: A survey,” in 2016 international conference on advances in computing, communications and informatics (ICACCI), IEEE, 2016, pp. 2635–2643.
+
+
+
[6]
I. Hernández, C. R. Rivero, and D. Ruiz, “Deep web crawling: A survey,” World Wide Web, vol. 22, pp. 1577–1610, 2019.
+
+
+
[7]
S. Deshmukh and K. Vishwakarma, “A survey on crawlers used in developing search engine,” in 2021 5th international conference on intelligent computing and control systems (ICICCS), IEEE, 2021, pp. 1446–1452.
+
+
+
[8]
Octoparse, “Web crawl.” 2024. Available: https://www.octoparse.com/
+
+
+
[9]
J. J. Cook, “An introduction to quarto: A versatile open-source tool for data reporting and visualization.”
+
+
+
[10]
S. Mati, I. Civcir, and S. I. Abba, “EviewsR: An R package for dynamic and reproducible research using EViews, R, R Markdown and Quarto.” R Journal, vol. 15, no. 2, 2023.
+
+
+
[11]
C. Paciorek, “An example quarto markdown file,” 2023.
+
+
+
[12]
I. Miroshnychenko, “Quarto: Revolutionizing content creation,” p. 189, 2023.
+
+
+
[13]
R. F. Hassan and S. Hussain, “Improving the web indexing quality through a website-search engine coactions,” International Journal of Computer and Information Technology, vol. 3, no. 2, 2014.
+
+
+
[14]
M. Coe, “Website indexing,” The Indexer: The International Journal of Indexing, vol. 34, no. 1, pp. 20–25, 2016.
+
+
+ + +
+ + + + + \ No newline at end of file diff --git a/CLMS_filenamingconvention.html b/CLMS_filenamingconvention.html new file mode 100644 index 0000000..ff79222 --- /dev/null +++ b/CLMS_filenamingconvention.html @@ -0,0 +1,762 @@ + + + + + + + + + + + + +CLMS filenaming convention + + + + + + + + + + + + + + + + + + + + +
+ +
+ +
+
+

CLMS filenaming convention

+

guidelines

+
+ + + +
+ +
+
Author
+
+

CLMS ICT-WG

+
+
+ +
+
Published
+
+

September 13, 2024

+
+
+ + +
+ + +
+
+
Keywords
+

guidelines, filenaming, reference, CLMS, File Naming Standards, Naming Conventions, File Structure, File Management, ICT Guidelines, Best Practices, File Systems Compatibility, geodata storage, Metadata, Information Retrieval, Interoperability, File Extension, File Identifier

+
+
+ +
+ + + +
+DRAFT +
+
+

1 Preface and terminology

+

To uniquely identify a file on a host or device, a string of characters is used which, in its full or absolute form, consists of several distinct parts. When the name is completely specified such that it is unambiguous and cannot be mistaken for any other file on that host or device, this defines a “fully qualified file name” or FQFN (see Section 2).

+

Unfortunately, terminology around file identifiers is not fully harmonized and can be quite confusing: e.g., the term ‘filename’ is commonly used for the FQFN, but also for the base name of a file with or without an extension.

+

For proper disambiguation a definition of all relevant terms as used in this document is given here first.

+
+
+

2 Fully Qualified File Name

+

A fully qualified file name or FQFN1 is a string of characters suited to entirely and uniquely identify a resource on a host or device. To fulfill its purpose the FQFN must specify device, path, file, and extension.

+
+

2.1 host or device (name)

+

String identifying the server or machine on the network where the file is stored.

+
+
+

2.2 path (name)

+

String identifying the folder and subfolders in which the file is stored.

+
+
+

2.3 (base or stem) filename

+

String identifying an individual file, without the suffix, which is referred to as the extension.

+
+
+

2.4 extension or suffix

+

String indicating a characteristic (file type) of the file contents or its intended use, usually found at the end of a file URI.

+
+
+
+ +
+
+The FQFN and its elements: +
+
+
+

host://path/to/file/filename.extension

+
+
+

For the purpose of this document the term ‘filename’ relates to all characters of a FQFN preceding the first occurrence of the extension delimiter character ‘full stop’ or ‘period’ (“.”, ASCII 46) and following the last occurrence of the path delimiter, most commonly the ‘slash’ (“/”, ASCII code 47) or the ‘backslash’ character (“\”, ASCII code 92).

+
+
+
+

3 Filename and path length

+

The maximum FQFN length is determined by the file system. File systems and their upper limits are:

+
    +
  • Windows (Windows 32 bit API2)

    +
      +
    • Maximum FQFN length: 260 characters (including drive letter, backslashes, and null terminator).

    • +
    • Maximum length of file name and file extension: 255 characters.

    • +
  • +
  • Linux (ext4)

    +
      +
    • Maximum FQFN length: 4096 characters.

    • +
    • Maximum length of file name and file extension: 255 characters.

    • +
  • +
  • macOS (APFS)

    +
      +
    • Maximum FQFN length: 1024 characters.

    • +
    • Maximum length of file name and file extension: 255 characters.

    • +
  • +
+
+

3.1 Recommendation on filename length

+

To ensure compatibility across all systems the smallest boundaries must be taken, i.e.,

+
    +
  • 260 characters maximum FQFN length, as defined by the Windows 32 bit API.

  • +
  • 255 characters for maximum length of file name and file extension.

  • +
+

To safely accommodate both file name and path, it is generally recommended to keep file names well under these maximum limits, aiming for a:

+
+
+
+ +
+
+

maximum of 100 characters.

+
+
+
+
+
+
+

4 The Filename

+

The following section describes the rules and the constraints for the creation of a filename.

+
+

4.1 Filename - ASCII characters

+

Filenames are composed of ASCII3 characters. To ensure maximum interoperability, the following rules are implemented:

+
    +
  • Alphanumeric (letters A-Z, ASCII 65-90 or 97-122, and numbers 0-9, ASCII 48-57): allowed; they are not handled as case sensitive.

  • +
  • Underscore (‘_’, ASCII 95): exclusively used for separating the different file naming fields (see Section 4.2).

  • +
  • Hyphen (’-’, ASCII 45): exclusively used as separator within fields, between the field and the suffix.

  • +
  • Period (‘.’, ASCII 46): not allowed. Note: the period is part of the file extension, where it is used to separate the filename from the file extension.

  • +
  • Space (‘ ’, ASCII 32): not allowed.

  • +
+
+
+

4.2 Fields - the elements of a filename

+

Fields are standardized descriptive elements composing a filename. The filename is made of at least one, but typically more than one, field. Fields are separated by the field delimiter character, the underscore (‘_’). Each field can have zero, one, or more suffixes. Suffixes are connected to the relevant field by a hyphen (‘-’) and must be placed after the field. A field suffix can be used to describe, e.g.,

+
    +
  • objects which exist only in relation to another object, e.g., the parameter ‘NDVI’ with its associated quality assessment parameter ‘NDVI-QA’.

  • +
  • product derivatives, e.g., ‘SWI-030’, ‘SWI-040’.

  • +
+

Fields may not be empty, so a field delimiter may not occur at the beginning or end of a filename, or twice in a row.

+
+
+
+ +
+
+

field1_field2_field3-suffix_field4

+
+
+
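The character and field rules above lend themselves to an automated check. A minimal Python sketch, assuming the 100-character cap recommended in Section 3.1; the example filenames are hypothetical:

```python
import re

# One field: alphanumeric characters, optionally followed by hyphenated
# suffixes (e.g. "NDVI-QA", "SWI-030"); fields are joined by underscores.
FIELD = r"[A-Za-z0-9]+(?:-[A-Za-z0-9]+)*"
FILENAME = re.compile(rf"{FIELD}(?:_{FIELD})*")

def is_valid_filename(stem: str, max_len: int = 100) -> bool:
    """Check a base filename (no extension) against the rules above."""
    return len(stem) <= max_len and FILENAME.fullmatch(stem) is not None

assert is_valid_filename("CLMS_NDVI-QA_20240801")  # fields plus suffix: OK
assert not is_valid_filename("CLMS__NDVI")         # empty field: rejected
assert not is_valid_filename("CLMS NDVI")          # space: rejected
```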
+
+
+

4.3 Filename - structural rules

+

The arrangement of fields is an essential part in the creation of a filename. The filename structure follows a predefined hierarchical order to ensure efficiency, consistency, and (machine) readability. A filename is subdivided in several categories, each of them addressing different purposes, and each of them composed of distinct and unique fields and eventually field-suffixes.

+

Each product or product group can have a different composition schema, i.e., using different categories or fields. The order of categories must be preserved, and the ‘main’ category (see Section 4.3.1) is compulsory.

+
+

4.3.1 Filename category ‘Main’

+

The first category in a filename is called ‘main’ and shall ensure that the filename can be unequivocally associated with a naming scheme. The naming scheme is a set of product- or product-group-specific rules for the composition of the filename. The main category must contain the information necessary to point to a given scheme. Each schema can have sub-schemata.

+
+
+

4.3.2 Filename category ‘Spatial’ identifier

+

There are two main spatial identifiers: the AOI (area of interest) of the product, and the AOI of the scene or image.

+
+
+

4.3.3 Filename category ‘Temporal’ identifier

+

Describes product-specific temporal elements such as the acquisition or reference date and, in the case of composite images, the compositing period.

+
+
+

4.3.4 Filename category ‘Production’ identifier

+

Describes file-specific details on the production process and information provenance, such as file version, processing mode, origin data, and processing date.

+
+
+

4.3.5 Filename category ‘Parameter’ identifier

+

The category ‘Parameter identifier’ is the last field in a filename, immediately preceding the file extension. This category is reserved for layer-level information of a file.

+
+
+
+Table 1: Categories and fields of a filename +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Category | Field |
|----------|-------|
| Main | Producing entity |
| Main | Theme |
| Spatial identifier | Resolution |
| Spatial identifier | Tile |
| Spatial identifier | Coverage |
| Temporal identifier | (Acquisition) Date |
| Production identifier | Platform |
| Production identifier | Version |
| Production identifier | Processing date |
| Parameter identifier | Product |
| Parameter identifier | Parameter |
+
+
+
+ +
+
Example decision tree
+
+


+
+
+
+
+
+ + +

Footnotes

+ +
    +
  1. https://en.wikipedia.org/wiki/Fully_qualified_name↩︎

  2. +
  3. NTFS supports approximately 32,767 characters. For compatibility reasons the limit imposed by the Windows 32bit API has been chosen.↩︎

  4. +
  5. https://en.wikipedia.org/wiki/ASCII↩︎

  6. +
+
+ + +
+ + + + + \ No newline at end of file diff --git a/CheatSheet.html b/CheatSheet.html new file mode 100644 index 0000000..3efd9eb --- /dev/null +++ b/CheatSheet.html @@ -0,0 +1,800 @@ + + + + + + + + + + + + +A Cheatsheet for Developing Standards for Generative AI Training and Web Crawlers + + + + + + + + + + + + + + + + + + + +
+ +
+ +
+
+

A Cheatsheet for Developing Standards for Generative AI Training and Web Crawlers

+

Information Provisioning for Generative Chatbots

+
+ + + +
+ +
+
Author
+
+

Ayan Chatterjee, Department of DIGITAL, NILU

+
+
+ +
+
Published
+
+

October 29, 2024

+
+
+ + +
+ + +
+
+
Keywords
+

AI standards, web crawlers, AI training, content formatting

+
+
+ +
+ + +

This document serves as a quick reference guide to ensure content follows the structured formats essential for web crawlers and AI systems. Using Quarto Markdown for HTML output and generating sitemaps are critical for efficient crawling, helping search engines and AI models quickly index and retrieve well-structured content.

+
+

1. Introduction

+
+

1.1. Importance of Structured Data for AI and Web Crawlers

+

Generative AI and chatbots rely heavily on structured data to provide meaningful and accurate responses. For these systems to operate efficiently, they need access to data that is easy to index, retrieve, and process. Properly formatted content enables web crawlers and AI models to efficiently access and retrieve data, improving the accuracy of results provided to users.

+

Web crawlers, also known as bots or spiders, index web content by following hyperlinks. They require well-structured content, often formatted in HTML, with clear metadata to ensure content is discoverable and up-to-date for search engines and AI systems.

+
+
+
+

1.2. Goals of Content Standardization

+
    +
  • Improved Data Access: Ensuring web crawlers and AI models can easily access structured data.
  • +
  • Enhanced Search Engine Optimization (SEO): Well-formatted content improves visibility and accessibility across search engines.
  • +
  • Better AI Model Training: Consistent data structure helps in training models more effectively.
  • +
  • Faster Retrieval: Structured content enables quicker retrieval of relevant information, especially in time-sensitive applications.
  • +
+
+
+
+

1.3. Benefits of Sitemaps and Metadata

+
    +
  • Sitemaps: Provide a roadmap for web crawlers to discover all content. A well-structured sitemap enhances a crawler’s efficiency, ensuring that content is indexed properly.
  • +
  • Metadata: Metadata improves the discoverability and accuracy of content retrieval. Metadata tags such as title, author, date, and description help crawlers and AI models understand the content’s structure and relevance.
  • +
+
+
+
+
+

2. Content Standards for AI and Web Crawlers

+
+

2.1. Content Structuring in Quarto Markdown

+

Quarto Markdown provides an efficient way to structure content for generative AI and web crawlers. Use clear headings, subheadings, and metadata to help web crawlers navigate the content.

+
+

YAML Example for Metadata

+
---
+title: "AI and Web Crawling Standards"
+author: "Your Name"
+date: "2024-09-30"
+keywords: ["AI standards", "web crawlers", "metadata"]
+sitemap: true
+---
+
+
+
+
+

2.2. HTML Structuring for Web Crawlers

+

Semantic HTML5 elements, such as <article>, <section>, and <header>, help web crawlers index and understand the content more efficiently.

+
---
+<article>
+  <header>
+    <h1>Understanding Web Crawlers</h1>
+    <meta name="description" content="Overview of web crawlers and their role in AI training." />
+  </header>
+  <section>
+    <h2>How Web Crawlers Index Content</h2>
+    <p>Web crawlers use links and metadata to index the web.</p>
+  </section>
+</article>
+---
+
+

2.2.1. Microdata for Structured Content

+
---
+<article itemscope itemtype="https://schema.org/Article">
+  <header>
+    <h1 itemprop="headline">AI and Web Crawling</h1>
+    <meta itemprop="description" content="Overview of AI training using web crawlers." />
+  </header>
+</article>
+---
+
+
+
+
+

2.3. PDF Structuring for AI Integration

+

For documents in PDF format, ensure proper tagging of sections and headings to improve readability and indexing by crawlers and AI models. Add relevant metadata to the document properties.

+
---
+title: "Structured PDF for AI"
+author: "Your Name"
+keywords: ["AI", "web crawlers", "PDF"]
+---
+
+
+
+

2.4. HTML Structuring for AI Integration

+

To optimize content for AI integration, HTML documents should include semantic elements, structured data formats like JSON-LD, and relevant metadata. This helps AI systems process and train on the content efficiently.

+
---
+<article itemscope itemtype="https://schema.org/Article">
+  <header>
+    <h1 itemprop="headline">AI Training Data and Web Crawlers</h1>
+    <meta name="description" content="How to structure content for AI training and web crawling." />
+  </header>
+  <section>
+    <h2>AI Model Training</h2>
+    <p>Semantic structure is essential for AI to understand content.</p>
+    <script type="application/ld+json">
+    {
+      "@context": "https://schema.org",
+      "@type": "Dataset",
+      "name": "AI Training Data",
+      "description": "Dataset structured for AI and web crawlers.",
+      "creator": {
+        "@type": "Organization",
+        "name": "Your Organization"
+      }
+    }
+    </script>
+  </section>
+</article>
+---
+
+
+
+
+

3. Importance of Sitemap Indexing in HTML Documents

+

Sitemaps are essential for enhancing the discoverability and accessibility of web content for both web crawlers and AI systems. As an XML file, a sitemap provides a structured roadmap of a website, listing URLs, metadata, and details like last-modified dates and update frequency. This helps crawlers index content efficiently and enables generative AI models to train on well-structured data, improving processing and retrieval accuracy. The key benefits of sitemap indexing for web crawling and AI training are:

+
    +
  • Improved Discoverability: Sitemaps enable web crawlers to find all relevant resources on a site, especially for deep or hard-to-reach pages.

  • +
  • Efficient Crawling: Crawlers can prioritize content based on metadata like the last updated date, making re-indexing more effective.

  • +
  • Structured Data for AI Training: Well-indexed documents help generative AI models understand relationships between content, improving relevance and accuracy in AI-generated responses.

  • +
  • Faster Content Retrieval: Sitemaps speed up indexing and ensure better search rankings, enabling faster content access for AI models.

  • +
+
---
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+<url>
+    <loc>https://<your-username>.github.io/<your-repo-name>/index.html</loc>
+    <lastmod>2024-10-08T12:24:05Z</lastmod>
+    <changefreq>monthly</changefreq>
+    <priority>0.8</priority>
+</url>
+</urlset>
+---
+

Submit your sitemap to search engines via tools like Google Search Console to ensure your content is indexed properly. This improves the discoverability of AI training datasets and documents by web crawlers and AI models.
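In addition to manual submission, a robots.txt file at the site root can point crawlers to the sitemap automatically. A minimal sketch, reusing the placeholder URL from the example above:

User-agent: *
Allow: /
Sitemap: https://<your-username>.github.io/<your-repo-name>/sitemap.xml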

+
+
+
+

4. Best Practices for Information Formatting

+
    +
  • Consistent Metadata: Use uniform metadata (title, author, description, keywords) across all documents.

  • +
  • Structured Headings: Organize content using headings and subheadings for easy navigation by both users and web crawlers.

  • +
  • Cross-references: Link to related content to improve discoverability and create a cohesive data ecosystem.

  • +
  • Clear Language: Use concise, non-technical language to ensure that both users and machines can understand the content.

  • +
+
+
+
+

5. Quarto Markdown Editors

+

To create and render Quarto Markdown (.qmd) files, we can use several editors that integrate well with Quarto. VS Code (Visual Studio Code), RStudio, JupyterLab with Quarto integration, and Atom with a Quarto plugin are popular editors that support Quarto and can generate .qmd files automatically.

+

RStudio is lightweight and easy to use; it integrates closely with Quarto and provides tools for rendering, previewing, and managing .qmd documents effectively.

+
+

Steps to Set It Up

+
    +
  1. Install RStudio: Download from RStudio.
  2. +
  3. Install Quarto: Follow Quarto installation instructions to install Quarto.
  4. +
  5. Create a New Quarto Document: +
      +
    • In RStudio, go to File > New File > Quarto Document.
    • +
    • Choose the type of document you want (e.g., HTML, PDF, Word).
    • +
    • A .qmd file will be created automatically.
    • +
  6. +
  7. Automatically Render .qmd: +
      +
  • After editing your document, you can preview it using Render or export it to various formats; equivalent Quarto CLI commands are shown after this list.
    • +
  8. +
+
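The same preview/render cycle can also be driven from the RStudio terminal with the Quarto CLI (the filename mydoc.qmd is a placeholder):

quarto preview mydoc.qmd        # live preview that refreshes on save
quarto render mydoc.qmd --to html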
+
+

Benefits

+
    +
  • Full support for Quarto with an integrated environment.
  • +
  • Provides tools for live preview and exporting.
  • +
  • Ideal for users familiar with R or data science workflows.
  • +
+
+
+
+
+

6. Automation with GitHub Deployment

+

Automation is crucial for ensuring efficiency and consistency in the deployment of content structured for AI integration and web crawlers. By automating the rendering of Quarto Markdown, Markdown, and Jupyter Notebook files into HTML, generating a sitemap, and deploying the output to GitHub Pages, the process becomes seamless and repeatable with minimal human intervention. This ensures that any changes to content are instantly reflected on the website, keeping the content discoverable and up-to-date for web crawlers and AI systems. The steps in the automation pipeline are listed below; a minimal workflow sketch follows the list.

+
    +
  1. Trigger on Push or Pull Requests: +
      +
    • The workflow is triggered whenever .qmd files are modified or included in a pull request, ensuring content is updated automatically.
    • +
  2. +
  3. Checkout Repository: +
      +
    • Retrieves the latest version of the repository where content resides.
    • +
  4. +
  5. Install Quarto: +
      +
    • Installs the necessary Quarto CLI to render files into HTML.
    • +
  6. +
  7. Render Content: +
      +
    • Converts Quarto Markdown, Markdown, and Jupyter Notebook files into HTML format for web deployment.
    • +
  8. +
  9. Move Generated HTML to Deployment Folder: +
      +
    • Organizes all generated HTML files into the designated folder (docs) for web deployment.
    • +
  10. +
  11. Generate Sitemap: +
      +
    • Automatically creates a sitemap.xml that follows the Google sitemap schema, helping search engines and web crawlers discover all available content on the website.
    • +
  12. +
  13. Deploy to GitHub Pages: +
      +
    • Deploys the docs folder, which contains the HTML and sitemap.xml, to GitHub Pages for public access.
    • +
  14. +
+
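The following GitHub Actions workflow is a minimal sketch of such a pipeline covering the steps above; the file layout, the generate_sitemap.py helper, and the third-party deployment action are illustrative assumptions, not the exact CLMS configuration.

# .github/workflows/publish.yml -- illustrative sketch, not the exact CLMS workflow
name: Render and Deploy
on:
  push:
    branches: [main]
  pull_request:

jobs:
  build-deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4                  # steps 1-2: trigger and checkout
      - uses: quarto-dev/quarto-actions/setup@v2   # step 3: install the Quarto CLI
      - name: Render content                       # steps 4-5: render sources to HTML in docs/
        run: quarto render --to html --output-dir docs
      - name: Generate sitemap                     # step 6: assumed helper script
        run: python generate_sitemap.py
      - name: Deploy to GitHub Pages               # step 7: publish docs/ via a community action
        uses: peaceiris/actions-gh-pages@v4
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          publish_dir: ./docs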
+
+
+

7. Conclusion

+

Standardizing content formatting using Quarto Markdown, HTML5, and sitemaps is essential for enabling effective web crawling and AI training. Structured data ensures improved discoverability, faster indexing, and better accessibility, supporting the development of more accurate and responsive AI models.

+
+
+ +
+ + +
+ + + + + \ No newline at end of file diff --git a/README.html b/README.html new file mode 100644 index 0000000..8d0fede --- /dev/null +++ b/README.html @@ -0,0 +1,292 @@ + + + + + + + + + +readme + + + + + + + + + + + + + + + + + + + +
+ +
+ + + +
+

Copernicus Land Monitoring Service (CLMS)

+

This repository contains technical documents for the CLMS, such as Algorithm Theoretical Basis Documents (ATBDs), Product User Manuals (PUMs), and nomenclature guidelines.

+
+ +
+ + +
+ + + + \ No newline at end of file diff --git a/clms.html b/clms.html new file mode 100644 index 0000000..df5a92a --- /dev/null +++ b/clms.html @@ -0,0 +1,1227 @@ + + + + + + + + + + + + +Developing CLMS Standards for Generative AI Training and Web Crawlers Using Quarto Markdown and Sitemaps + + + + + + + + + + + + + + + + + + + +
+ +
+ +
+
+

Developing CLMS Standards for Generative AI Training and Web Crawlers Using Quarto Markdown and Sitemaps

+

Task 10.1: Information Provisioning for Generative Chatbots

+
+ + + +
+ +
+
Author
+
+

Ayan Chatterjee, Department of DIGITAL, NILU

+
+
+ +
+
Published
+
+

September 16, 2024

+
+
+ + +
+ + +
+
+
Keywords
+

CLMS standards, web crawlers, AI training, information formatting

+
+
+ +
+ + +
+
+

Abstract

+

Generative chatbots rely on large amounts of structured data to provide accurate, timely responses to user queries. By developing Copernicus Land Monitoring Service (CLMS) standards for information formatting and delivery using Quarto Markdown and sitemaps, we can ensure that the vast amounts of environmental data in CLMS are accessible to web crawlers and AI models. Using standardized structured content improves the discoverability and findability of CLMS products and makes it easier for users to access relevant datasets through traditional search engines and generative chatbots.

+

In addition, by providing clear guidelines for content formatting, cross-referencing, and sitemap management, this approach ensures that the CLMS data repository remains up-to-date and well-organized. This in turn supports the training of AI models to help users find exactly the CLMS products they need, whether through direct query or generative chatbot interaction.

+
+
+
+
+

1. Introduction

+
+

1.1. Importance of Copernicus Land Monitoring Service (CLMS)

+

The Copernicus Land Monitoring Service (CLMS) is a critical component of the Copernicus Programme, which is the European Union’s Earth observation initiative [1]. The service is responsible for providing timely and accurate land cover and land use data, along with a wide range of environmental variables related to land ecosystems. This data is essential for understanding and managing Europe’s environmental resources, supporting sustainable development, climate monitoring, and informed policy-making. The key areas where CLMS is vital include:

+
    +
  • Environmental Monitoring: CLMS provides data on land cover, vegetation, soil, and water bodies, which are crucial for monitoring environmental changes such as deforestation, urban sprawl, and the health of ecosystems. This data supports conservation efforts and helps in tracking biodiversity and land degradation.

  • +
  • Sustainable Land Management: With the growing need for sustainable practices, CLMS delivers data that helps governments and organizations plan and manage land resources more effectively. It supports agriculture, forestry, water management, and urban planning, helping to mitigate the effects of climate change.

  • +
  • Climate Change Monitoring: CLMS plays a significant role in assessing the impact of climate change on European landscapes. It helps track changes in land use, vegetation, and land surface temperatures, which are important indicators of climate change impacts.

  • +
  • Disaster Management: CLMS data is used for emergency response and disaster management, especially in cases of floods, fires, and other natural disasters. The accurate and near-real-time data allows authorities to take preventive actions and make quick decisions during emergencies.

  • +
  • Policy Support and Decision-Making: The service supports EU environmental policies, including the Green Deal, Common Agricultural Policy (CAP), and the EU Biodiversity Strategy. The data provided by CLMS informs decision-makers at the European, national, and local levels, ensuring that policies are grounded in the latest environmental data.

  • +
+
+
+

1.2. Importance of CLMS Documentation for Web Crawlers: Enhancing Product Discoverability and Findability

+

The discoverability and findability of CLMS products on the web are crucial for ensuring that this valuable environmental data is accessible to a wide range of users, including researchers, policymakers, and environmental organizations. Making CLMS documentation available on the web for crawlers facilitates product discoverability by enabling search engines and AI-powered systems (like generative chatbots) to index, retrieve, and present relevant data to users. Here’s why ensuring that CLMS documents are available to web crawlers is essential:

+
    +
  • Increased Accessibility for Diverse Users: CLMS products cater to a broad audience, including government agencies, NGOs, scientists, and the public. Properly formatted and exposed documentation allows these users to easily find and access data via search engines. Web crawlers can efficiently index CLMS products, simplifying the search for specific datasets without navigating complex databases.

  • +
  • Enhanced Search Engine Optimization (SEO): Well-formatted content with consistent metadata and semantic structure ranks higher in search results, improving the visibility and accessibility of CLMS products across search engines.

  • +
  • Improved Product Findability Through AI and Chatbots: AI-powered search tools and chatbots use indexed information to generate responses. By ensuring that CLMS documentation is structured for crawling, CLMS products become accessible to third-party chatbots, expanding their reach through natural language queries and conversational interfaces.

  • +
  • Faster and More Accurate Data Retrieval: Well-formatted CLMS documents enable faster and more accurate data retrieval, essential for time-sensitive applications like disaster management. Proper crawling ensures that search engines and AI systems provide up-to-date CLMS products, crucial for timely decision-making.

  • +
  • Standardization and Interoperability: Adopting CLMS standards and formats like Quarto Markdown ensures consistency, making documents easier to index and retrieve. Standardization promotes interoperability, allowing CLMS data to be used across various platforms, including AI systems and environmental tools.

  • +
  • Global Reach and Broader Impact: Making CLMS documents available to web crawlers increases their global reach. Optimized data allows users worldwide to access key environmental information, contributing to global initiatives, research, and policymaking beyond the EU.

  • +
  • Supporting Third-Party Integration: Third-party platforms rely on web crawlers and AI tools to access CLMS data. By exposing CLMS products to crawlers, the data can be integrated into various tools and services, enhancing discoverability and promoting broader use in AI-driven analytics and public services.

  • +
+

By making CLMS documents available to web crawlers using standardized formats such as HTML, PDF, and DOCX (which adhere to semantic structure, web standards, and use metadata), CLMS can ensure that its products are easily indexed, retrieved, and integrated into a variety of search engines, artificial intelligence systems, and chatbots. This strategy not only increases the visibility of CLMS products, but also improves accessibility to a global audience, ensuring that researchers, policymakers, and the public can effectively find and use CLMS data. At a time when timely, accurate environmental data is becoming increasingly important, optimizing CLMS products for web crawlers is a necessary step to ensure that everyone has access to these valuable resources.

+
+
+

1.3. Web crawling and Information Provisioning for Generative Chatbots

+

Web crawling is the process used by search engines to explore and index the web pages of websites. The crawler downloads pages, reads the content, and adds it to the search engine’s index. Crawlers are designed to navigate from one page to another by following hyperlinks, allowing them to efficiently cover a website’s entire structure. Search engines rely on crawlers to keep their results up-to-date by regularly visiting websites and checking for new or modified content. Googlebot, Bingbot, and Yahoo Slurp are examples of popular web crawlers. Key terms involved in web crawling are:

+
    +
  • Search engine: A system that allows users to search for content on the web.
  • +
  • Indexing: The process of storing web content so it can be retrieved later.
  • +
  • Web pages: Documents that make up the web, interconnected by hyperlinks.
  • +
  • Hyperlinks: Links that connect different web pages, forming a navigable web.
  • +
+

Web crawling has become essential for search engines and AI applications. The integration of these technologies has been explored extensively [2], [3], [4], [5]. The growth of digital content has placed significant demands on the efficiency and accuracy of web crawlers and artificial intelligence (AI) models [6], [7]. In response, CLMS standards for content formatting are essential for establishing uniformity in the way data is formatted, structured, and exposed to automated tools such as crawlers and AI training pipelines. These standards help ensure that content is easy to access, interpret, and process, leading to more accurate information retrieval and AI model training. This document outlines the development of CLMS standards for exposing information to web crawlers and optimizing its formatting for AI data ingestion. Figure 1 illustrates how a web crawler works [8].

+
+
+
+ +
+
+Figure 1: Diagram illustrating web crawling [8]. +
+
+
+

In recent years, generative chatbots have made great progress and become powerful tools that allow users to access detailed information and conduct complex queries. In particular, chatbots can help users explore certain aspects of CLMS products, such as allocation rules or the purpose of a particular product. These tools are not only critical for product discoverability, but also improve user understanding of CLMS products. To ensure that chatbots effectively help users find and understand CLMS products, it is important that the underlying information is formatted and presented in a way that is easy to find and use. This requires well-structured documentation and a system that allows web crawlers and AI models to effectively access and process CLMS data.

+

Web crawlers and AI models are critical to the discoverability of online information. Web crawlers that index websites rely on well-structured content to perform their tasks effectively. Similarly, generative AI models, including chatbots, require high-quality structured data to produce accurate and meaningful results. CLMS provides important environmental data, but in order for this data to be useful to AI models and easy for users to find, it must be properly formatted and made available.

+
+

1.3.1. Motivation

+

Advances at the intersection of AI and web crawling have opened new frontiers in both fields. The primary motivation for creating CLMS standards lies in the need for:

+
    +
  • Improved Crawling Efficiency: Properly formatted content with metadata helps crawlers index relevant information faster and more accurately.

  • +
  • Better AI Model Training: Consistent content structure ensures that AI models are trained on high-quality, organized data.

  • +
  • Data Accessibility: Standardizing the structure of content ensures that information is universally accessible across platforms.

  • +
+

The following key aspects are critical for ensuring that data is structured and accessible for web crawlers and AI systems:

+
    +
  • Uniform metadata: Consistent metadata usage across all content is essential. Metadata includes details like title, author, keywords, and publication date. Uniform metadata ensures that web crawlers and AI systems can easily index and categorize content, improving searchability and discoverability.

  • +
  • Clearly defined content sections: Content should be organized into distinct sections, such as titles, headings, and subheadings. This structured format helps both users and machines navigate through the content efficiently, making key information easy to locate and retrieve.

  • +
  • Embedded structured data formats: Incorporating structured data formats such as JSON-LD, RDF, or XML provides a precise way of representing information. These formats help web crawlers and AI systems understand relationships and attributes within the content, facilitating accurate extraction, interpretation, and use of the data across various platforms.

  • +
+
+
+

1.3.2. Importance

+
    +
  • Enhanced Web Crawling: Properly structured CLMS content will improve web crawlers’ ability to index and retrieve information.

  • +
  • Improved AI Training: Structured data ensures higher-quality datasets, which result in better-trained AI models, particularly for generative chatbots.

  • +
  • Better User Experience: By improving product discoverability and findability, users will have an easier time accessing and understanding CLMS products.

  • +
+
+
+
+ +
+
+Tip +
+
+
+

Given the growing complexity of CLMS products and the increasing reliance on generative AI tools, it is critical to implement standards that improve the discoverability and usability of CLMS data.

+
+
+
+
+
+ +
+
+Note +
+
+
+

By standardizing the format and delivery of CLMS information, our goal is to ensure that generative AI applications, such as web crawlers and chatbots, can effectively access and use this data.

+
+
+
+
+
+
+

2. Content Standards

+

Developing content standards requires collaboration between content creators, data engineers, and AI researchers. The process typically follows these stages for different document types in use:

+
+

2.1. Content Structuring

+

Content structuring involves organizing data into recognizable, standard components, such as:

+
    +
  • Title: Main identifier of the content.

  • +
  • Metadata: Information about the content, including authors, dates, keywords, and relevant classification.

  • +
  • Headings and Subheadings: Structured sections that break down the content into digestible parts.

  • +
+

An example of metadata formatting is given below:

+
---
+title: "Developing CLMS Standards for Generative AI Training and Web Crawlers"
+subtitle: "Task 10.1: Information Provisioning for Generative Chatbots"
+author: "Ayan Chatterjee, Department of DIGITAL, NILU, ayan@nilu.no."
+date: "2024-09-10"
+sitemap: true           #Enables sitemap generation for web crawlers
+toc: true              # Enable the Table of Contents
+toc-title: "Index"      # Customize the title of the table of contents
+toc-depth: 3            # Include headings up to level 3 (###)
+keywords: ["CLMS standards", "web crawlers", "AI training", "information formatting"]
+bibliography: references.bib   # Link to the bibliography file
+csl: ieee.csl                  # Link to the CSL file for IEEE style
+format: 
+  html: default
+  pdf: default
+  docx: default
+---
+
+
+

2.2. HTML Structuring

+

The following structured approach in HTML allows web crawlers to effectively index and retrieve content while facilitating AI training for generative models, ensuring that information is both accessible and usable:

+
+

2.2.1. Semantic Structuring and Formatting

+

To enhance both machine readability and user comprehension, we must follow structured and semantic formatting principles. This includes using HTML5 elements, schema markup, and clear metadata. HTML5 semantic elements like <article>, <section>, <header>, and <footer> help structure the document meaningfully. For example:

+
<article>
+  <header>
+    <h1>Understanding Web Crawlers</h1>
+    <meta name="description" content="How web crawlers work and index ..!" />
+  </header>
+  <section>
+    <h2>How Crawlers Index Content</h2>
+    <p>Web crawlers use semantic structure to efficiently index web pages.</p>
+  </section>
+  <footer>
+    <p>Author: Ayan Chatterjee</p>
+  </footer>
+</article>
+
+
+

2.2.2. Microdata for Enhancing Machine Readability

+

Microdata attributes such as itemscope, itemtype, and itemprop provide semantic clarity for machines, enabling more efficient crawling and interpretation.

+
<article itemscope itemtype="https://schema.org/Article">
+  <header>
+    <h1 itemprop="headline">Web Crawling Explained</h1>
+    <meta itemprop="description" content="How web crawlers index ..?" />
+  </header>
+</article>
+
+
+

2.2.3. Schema Markup for Structured Content

+

Use Schema Markup (like ResearchArticle, Dataset, or CreativeWork) to define the content type and enhance machine readability. This helps both web crawlers and AI to categorize content accurately.

+
<article itemscope itemtype="https://schema.org/ResearchArticle">
+  <header>
+    <h1 itemprop="headline">AI Training for Web Crawlers</h1>
+    <meta itemprop="description" content=" AI training techniques for .." />
+  </header>
+</article>
+
+
+

2.2.4. Headings and Subheadings

+

Provide clearly defined headings and subheadings to organize content for easier navigation and indexing by crawlers.

+
---
+# How AI Models are Trained
+## Data Collection
+## Model Training
+## Evaluation
+---
+
+
+

2.2.5. Alt Text and Descriptions

+

For images and diagrams, always provide alt text and descriptions to improve accessibility.

+
![A diagram illustrating how web crawlers work]
+(images/web_crawlers.png){alt="A diagram of web crawler processes" width=50%}
+
+
+

2.2.6. Meta Tags and Descriptions

+

Add meta tags and descriptions to help web crawlers index the content more accurately.

+
<meta name="description" content="How web crawlers work effectively!" />
+
+
+

2.2.7. Phrasing and Content Presentation

+

Ensure that important keywords are present in titles, headings, and throughout the content without overusing them (avoid keyword stuffing).

+
# Introduction to Web Crawlers and AI Training
+Web crawlers, also known as spiders, are used by search engines to index web ...
+

Write in a clear and concise manner. Avoid jargon unless necessary, and ensure that key concepts are easy to understand.

+
Web crawlers automatically scan websites to collect and index content. 
+They follow links, downloading web pages and saving them for future queries.
+

Use hyperlinks and cross-references to guide both users and web crawlers to related content.

+
For more details, see the [Introduction to AI Training](#data-collection).
+

Provide a brief abstract or summary at the beginning of each article or section for better clarity and indexing.

+
**Summary:** This article provides an overview of indexing content, 
+and their integration with AI.
+
+
+

2.2.8. Structured Data Repositories

+

To enable knowledge transfer to generative AI, use standardized formats like JSON-LD, RDF, or XML to define metadata and structure.

+
{
+  "@context": "https://schema.org",
+  "@type": "Dataset",
+  "name": "AI Training Dataset",
+  "description": "A dataset designed to improve search engine crawlers."
+}
+
<dataset xmlns="http://www.w3.org/2001/XMLSchema-instance" type="AI Training Dataset">
+  <name>AI Training Dataset</name>
+  <description>A dataset designed for training AI models.</description>
+</dataset>
+
+
+
+

2.3. PDF Structuring

+

The following structured approach to PDFs improves document indexing by web crawlers, integration with AI systems, and overall accessibility for users:

+
+

2.3.1. Accessible PDF Formats by Tagging

+

Ensure that the PDF is tagged properly so that screen readers and AI tools can interpret the document structure. For instance, headings, paragraphs, and lists should be tagged semantically.

+
# Heading 1 (tagged as <h1>)
+- List item 1 (tagged as <ul><li>)
+
+
+

2.3.2. Structuring and Formatting

+

The document structure should be accessible, with a clear hierarchy and a clickable table of contents (TOC). Accessible tagging, hierarchical organization, and text over image improve the usability for both humans and machines.

+

Organize content into a well-defined hierarchy using headings (#, ##, ###). This improves both user navigation and machine parsing for AI and web crawlers.

+
## Section 1: Introduction
+### Subsection 1.1: Overview
+
+toc: true
+toc-depth: 2
+
+
+

2.3.3. Adding Metadata

+

Embedding metadata such as document properties (e.g., Title, Author, Subject, and Keywords), XMP metadata, Schema.org metadata, and descriptive metadata helps search engines and AI systems index, categorize, and retrieve information efficiently.

+
title: "PDF Structuring and Formatting"
+author: "Ayan Chatterjee"
+subject: "Document Accessibility and Metadata"
+keywords: ["PDF accessibility", "metadata", "AI integration"]
+

XMP metadata is stored as XML in the PDF file, allowing for rich data descriptions. Schema.org metadata in JSON-LD provides structured information that AI and web crawlers can easily understand.

+
{
+  "@context": "https://schema.org",
+  "@type": "CreativeWork",
+  "name": "PDF Structuring and Formatting",
+  "author": {
+    "@type": "Person",
+    "name": "Jane Doe"
+  },
+  "keywords": ["PDF accessibility", "metadata", "AI integration"]
+}
+
+
+

2.3.4. Optimizing Content Presentation

+

Ensuring the proper placement of keywords, providing alt text for images, and correctly labeling figures and tables contribute to the searchability and accessibility of the content. This is crucial for effective interaction with web crawlers and AI models.

+
Keywords: PDF accessibility, web crawlers, generative AI
+![A flowchart showing the PDF processing workflow](path/to/image.png){alt="PDF workflow"}
+![Figure 1: A table of contents structure](path/to/image.png){#fig-toc}
+
+
+

2.3.5. Setting Up for Knowledge Transfer to Generative AI

+

Using machine-readable fonts (e.g., Arial, Times New Roman), a clean and simple layout, and adding comments or annotations helps prepare the document for use in generative AI systems. AI models benefit from well-structured and easy-to-parse content, which improves their ability to understand and generate meaningful responses based on the content.

+
## Section 1: Overview
+This section introduces the importance of accessible PDFs for AI processing...
+
+<!-- This annotation explains the role of hierarchical metadata for AI -->
+
+
+
+ +
+
+Important +
+
+
+

Through such structured practices, we can ensure that the content is both human-readable and machine-readable, facilitating easy discovery by web crawlers and seamless integration with AI training systems.

+
+
+
+
+
+
+

3. Developing CLMS Standards

+

In the context of Developing CLMS Standards, it is essential to utilize advanced tools that support both the creation of well-structured documents and the easy discoverability of content for web crawlers and AI systems. Several tools are available for content formatting, documentation, and publication. Among these, Quarto stands out due to its versatility, allowing users to create, format, and publish documents in multiple formats (HTML, PDF, Word) with integrated support for code execution and structured content.

+

This section compares several of these tools, explaining why Quarto is particularly suitable for creating CLMS-compliant documentation. We’ll also cover how to configure Quarto with Jupyter Notebooks and the importance of using Quarto Markdown for CLMS content. A Quarto Markdown file provides a structured approach to documenting the development of CLMS standards, ensuring content is easily accessible by both web crawlers and AI systems.

+
+

3.1. Tools for CLMS Documentation

+
    +
  • Quarto: Quarto is a highly versatile tool for creating and publishing documents, including PDFs, with rich formatting, code integration, and support for multiple formats (HTML, PDF, Word). Quarto’s cross-platform capabilities make it ideal for creating structured and searchable documents for CLMS, supporting web crawlers and AI applications.

  • +
  • R Markdown: A popular tool in the R community that allows users to combine narrative text with R code, producing output in HTML, PDF, and Word formats. Though powerful for statistical analysis, it is more limited in non-R-based workflows compared to Quarto.

  • +
  • Jupyter Notebooks: An interactive tool supporting over 40 programming languages, commonly used for data science and computing. Notebooks can be exported to multiple formats (HTML, PDF, slides), but lack Quarto’s advanced content formatting features.

  • +
  • Pandoc: A universal document converter that enables conversion between various markup formats, including Markdown, LaTeX, and HTML. While powerful for conversions, Pandoc lacks the code integration and dynamic formatting of Quarto.

  • +
  • LaTeX: A document preparation system for producing scientific and technical documents. While highly customizable, it requires significant expertise and lacks the ease of Markdown tools like Quarto.

  • +
  • Hugo: A static site generator used for creating websites and blogs from Markdown files. While efficient for websites, it doesn’t provide the same level of document control and integration as Quarto.

  • +
  • Sphinx: A documentation generator mainly used for Python projects. It supports conversion to formats like HTML and PDF but lacks the cross-language support and document versatility of Quarto.

  • +
  • Bookdown: An extension of R Markdown, designed for writing books and long documents. It supports multiple output formats but is mostly R-focused, while Quarto supports multiple languages.

  • +
  • GitBook: A tool for creating documentation and books using Markdown. It allows collaboration but lacks the dynamic formatting and multi-language support found in Quarto.

  • +
  • Pelican: A static site generator that uses Markdown or reStructuredText. Best suited for blogs, it doesn’t provide the integrated support for complex documents required by CLMS standards.

  • +
  • Typora: A WYSIWYG Markdown editor that offers easy editing but lacks the advanced document control and integration capabilities that Quarto provides.

  • +
+

A comparison of tools for CLMS documentation is shown in Table 1 below. As the table shows, Quarto outperforms the other tools in terms of supported output formats and reproducibility.

+
+
+
+Table 1: Comparative analysis of Quarto versus other formatting tools. +
+
+ ++++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Tool | Cross-Language Support | Output Formats | Code Integration | Static Site Generation | Ideal Use Case
Quarto | Yes | HTML, PDF, Word | Yes | Yes | Reports, blogs, CLMS docs
R Markdown | R only | HTML, PDF, Word | Yes (R) | No | Statistical reports
Jupyter Notebooks | 40+ languages | HTML, PDF | Yes | No | Data science
LaTeX | Limited | PDF, HTML | No | No | Scientific papers
Hugo | No | HTML | No | Yes | Blogs, websites
Sphinx | Python | HTML, PDF | No | Yes | Python documentation
+
+
+
+
+
+

3.2. Quarto Markdown

+

Markdown is a lightweight, easy-to-read syntax used for formatting plain text documents [9], [10], [11]. In Quarto, Markdown is extended to support additional features beyond standard Markdown, allowing users to write text, integrate code, and generate richly formatted documents in various formats such as HTML, PDF, and Word [9], [10], [11]. Quarto Markdown combines the simplicity of regular Markdown with powerful features for document rendering, making it ideal for data analysis, technical writing, academic papers, and reports [9], [10], [11].

+

Quarto Markdown uses the standard Markdown syntax for headings, lists, emphasis, and links, while also supporting enhanced features like cross-referencing, citations, figures, tables, mathematical equations, and more [9], [10], [11]. Quarto also allows for code execution in multiple programming languages (such as Python, R, and Julia) embedded within the Markdown file, enabling dynamic document creation where the outputs are generated directly from the code [9], [10], [11], [12].

+

Key features of Markdown in Quarto are:

+
    +
  • Standard Markdown: Supports headings, lists, links, images, bold, italics, etc.
  • +
  • YAML Header: Allows users to specify metadata like title, author, date, and output formats (HTML, PDF, Word) at the start of the document.
  • +
  • Cross-references: Provides automatic numbering and referencing for figures, tables, sections, etc.
  • +
  • Code Execution: Integrates code cells for multiple programming languages, making it possible to run code and include its outputs directly in the document.
  • +
  • Mathematics and Equations: Supports LaTeX-style equations for technical writing.
  • +
  • Citations: Allows for referencing research papers and articles using BibTeX or CSL styles.
  • +
  • Multi-output Format: Enables seamless conversion to multiple formats like HTML, PDF, Word, presentations, and slides.
  • +
+
+

3.2.1. Significance

+

Markdown in Quarto is significant due to its simplicity and flexibility for CLMS documentation. With an easy-to-use syntax, it allows users to format text without requiring complex tools, making it accessible to both non-technical users and programmers. This flexibility enables the creation of a wide variety of documents, ranging from blog posts to scientific reports. Quarto extends standard Markdown by supporting rich formatting options essential for technical and academic writing, including built-in support for tables, figures, equations, footnotes, and cross-referencing.

The integration of code and text is another powerful feature: Quarto Markdown can embed code execution within documents. This is critical for reproducible research, enabling the inclusion of tables, charts, and figures generated directly from code, which makes it highly suitable for data science and technical reporting. Additionally, Quarto Markdown supports multi-format output, allowing users to create content once and export it to multiple formats like HTML, PDF, and Word, streamlining document preparation for different audiences.

When used for online content, its structured format improves SEO (Search Engine Optimization), making it easier for search engines to index content and enhancing discoverability. The ease of managing references, citations, and cross-references further strengthens its utility in academic and research documentation. Since Markdown files are plain text, Quarto integrates seamlessly with version control tools like Git, enabling easy collaboration among multiple contributors, especially in open-source and research communities. Finally, Quarto Markdown’s versatility extends across blogs, technical documentation, reports, scientific papers, and books, making it an ideal tool for content creators across various disciplines.

+
+
+

3.2.2. Configuring Quarto with Jupyter Notebooks

+

To integrate Quarto with Jupyter Notebooks:

+
    +
  • Install Quarto: Download and install Quarto from Quarto.org.

  • +
  • Install Jupyter: Ensure you have Jupyter installed. If not, install it with pip: pip install notebook

  • +
  • Rendering: Write your content in Jupyter Notebooks, then render the notebook with Quarto to multiple formats:
    quarto render your-notebook.ipynb --to html
    quarto render your-notebook.ipynb --to pdf
    quarto render your-notebook.ipynb --to docx

  • +
  • YAML Header in Jupyter:

    +
    ---
    +title: "CLMS Data Analysis"
    +author: "Ayan Chatterjee"
    +format:
    +  html: default
    +  pdf: default
    +  docx: default    
    +---
  • +
+
+
+
+

3.3. Indexing

+

Proper indexing is essential for increasing the discoverability and accessibility of CLMS products [13], [14]. By formatting documents using Quarto Markdown and generating a sitemap.xml, we can ensure that search engines and AI systems efficiently crawl and retrieve CLMS content [13], [14]. To improve document indexing for enhanced discoverability and accessibility, we can adopt the following approaches:

+
    +
  • Organize content using structured headers and metadata in Quarto Markdown.
  • +
  • Use proper keywords and descriptions in the document metadata.
  • +
  • Cross-reference related documents to create interconnected content that helps crawlers navigate.
  • +
+
---
+title: "Land Use Mapping with CLMS Data"
+author: "Ayan Chatterjee"
+date: "2024-08-01"
+keywords: ["land use", "CLMS", "mapping", "environment"]
+description: "A detailed report on how CLMS data."
+---
+
+

3.3.1. Sitemap Generation

+

A sitemap.xml helps web crawlers discover all the content on the website [13], [14]. By providing a clear roadmap, crawlers can index each document, ensuring that all CLMS resources are available for search and AI training. By using Quarto Markdown and generating a sitemap.xml, CLMS documents can be structured in a way that improves their indexing, making them more discoverable by search engines and AI systems. This approach ensures efficient crawling, improves search engine ranking, and enhances the accessibility of CLMS products for users and AI models alike.

+
    +
  • Search Engine Discoverability: Users and AI systems can easily find the indexed CLMS documents.
  • +
  • Efficient Crawling: The sitemap provides a roadmap, allowing for faster and more accurate indexing.
  • +
  • Increased Accessibility: Properly indexed documents are easier for users and AI to retrieve and utilize, improving the overall product visibility.
  • +
+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+   <url>
+      <loc>http://example.com/clms/land-use-mapping</loc>
+      <lastmod>2024-08-01</lastmod>
+      <changefreq>monthly</changefreq>
+   </url>
+   <url>
+      <loc>http://example.com/clms/land-cover-change</loc>
+      <lastmod>2024-07-15</lastmod>
+      <changefreq>monthly</changefreq>
+   </url>
+</urlset>
+
+
+

3.3.2. Steps to Implement and Submit the Sitemap

+
    +
  • Generate the Sitemap: Use a sitemap generator tool (e.g., XML-Sitemaps or Screaming Frog) to create a sitemap, or have it generated automatically by a CMS like WordPress or a static site generator like Hugo; a small scripted alternative is sketched after this list.

  • +
  • Upload the Sitemap: Once generated, place the sitemap.xml file in the root directory of your website, e.g., https://www.example.com/sitemap.xml.

  • +
  • Submit to Search Engines: Submit your sitemap to search engines via tools like Google Search Console and Bing Webmaster Tools. This helps search engines index your site properly.

  • +
+
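As a scripted alternative to generator tools, the following Python sketch builds a sitemap.xml from a folder of rendered HTML files; the docs/ folder name and base URL are assumptions for illustration.

# generate_sitemap.py -- hypothetical helper; assumes rendered HTML lives in ./docs
from datetime import datetime, timezone
from pathlib import Path

BASE_URL = "https://www.example.com"   # placeholder site root
DOCS = Path("docs")

entries = []
for page in sorted(DOCS.glob("*.html")):
    # Use the file's modification time as the <lastmod> value
    lastmod = datetime.fromtimestamp(page.stat().st_mtime, tz=timezone.utc)
    entries.append(
        "  <url>\n"
        f"    <loc>{BASE_URL}/{page.name}</loc>\n"
        f"    <lastmod>{lastmod.strftime('%Y-%m-%dT%H:%M:%SZ')}</lastmod>\n"
        "    <changefreq>monthly</changefreq>\n"
        "  </url>"
    )

sitemap = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'
    + "\n".join(entries)
    + "\n</urlset>\n"
)
(DOCS / "sitemap.xml").write_text(sitemap, encoding="utf-8")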
+
+

3.3.3. Enhancing Indexing for Web Crawlers and AI Models

+

To ensure that CLMS documents are findable and accessible to web crawlers and AI models, it’s important to implement proper steps for generating and submitting a sitemap and using structured data (such as metadata and JSON-LD) to enhance indexing.

+
    +
  • Descriptive Filenames: Use filenames that clearly describe the content of the document. For instance, instead of doc1.md, use clms-land-monitoring-data.md.

  • +
  • Metadata: Add descriptive metadata in your Quarto Markdown files (e.g., title, author, keywords). This helps search engines and AI models understand the content better.

  • +
  • Text Content: Ensure that text content is descriptive and structured using headings and subheadings to guide crawlers.

  • +
  • HTML Metadata and JSON-LD Structured Data: Use HTML metadata and JSON-LD structured data within the Quarto document to improve how your content is indexed by search engines and used by AI training systems.

  • +
+

The following Quarto Markdown YAML header example demonstrates how to enhance document visibility for web crawling and AI training by including metadata and structured data. This can be part of your CLMS documentation to ensure that it is well-indexed and easy to discover.

+
---
+title: "CLMS Land Monitoring Data"
+author: "Ayan Chatterjee"
+date: "2024-09-15"
+keywords: ["CLMS", "web crawling", "AI training", "environmental data"]
+description: "Comprehensive overview of CLMS land monitoring datasets, ......"
+sitemap: true  # Flag to include this document in the sitemap
+
+# HTML metadata for SEO and discoverability
+meta:
+  - name: "description"
+    content: "CLMS land monitoring datasets for environmental and climate ..."
+  - name: "keywords"
+    content: "CLMS, land monitoring, environmental data, AI, web crawling"
+
+# JSON-LD structured data to help search engines and AI understand the content
+json-ld:
+  - "@context": "https://schema.org"
+    "@type": "Dataset"
+    "name": "CLMS Land Monitoring Data"
+    "description": "Detailed data on land monitoring and ...."
+    "url": "https://www.example.com/clms-land-monitoring-data"
+    "keywords": "land monitoring, environmental data, AI training.."
+    "datePublished": "2024-09-15"
+    "creator":
+      "@type": "Organization"
+      "name": "Copernicus Land Monitoring Service"
+    "publisher":
+      "@type": "Organization"
+      "name": "European Environment Agency"
+---
+
+
+
+ +
+
+Important +
+
+
+

Quarto stands out as the most versatile tool for creating CLMS-compliant documents, with cross-language support, integration of code, multiple output formats, and the ability to generate static websites.

+
+
+
+
+
+ +
+
+Important +
+
+
+

To ensure that CLMS documents are findable and accessible to web crawlers and AI models, it’s important to implement proper steps for generating and submitting a sitemap and using structured data (such as metadata and JSON-LD) to enhance indexing.

+
+
+
+
+
+ +
+

4. Conclusion

+

The European Environment Agency (EEA) recognizes the growing need for generative chatbots and natural language analysis tools to facilitate easy access to CLMS data. In response, the EEA is undertaking preparatory efforts to establish the necessary standards and infrastructure for successful chatbot integration. These activities focus on ensuring that CLMS products are findable and discoverable, enabling users, regardless of technical expertise, to access environmental data seamlessly.

+

A key part of this strategy is making CLMS documentation and data accessible to third-party generative AI platforms. By implementing standards for formatting and exposing information—particularly through Quarto Markdown and sitemaps—CLMS ensures that high-quality, structured data is available to chatbots and AI systems. This not only enhances product discoverability but also improves user experience, allowing chatbots to guide users through complex datasets and environmental resources.

+

The collaboration between CLMS and the EEA lays the groundwork for a future where AI systems can efficiently retrieve and process environmental data, supporting informed decision-making and increasing public engagement with CLMS products.

+
+
+

5. References

+
+
+ + +

References

+
+
[1]
E. Project, “CLMS - copernicus land monitoring service.” 2024. Available: https://land.copernicus.eu/en
+
+
+
[2]
M. A. Khder, “Web scraping or web crawling: State of art, techniques, approaches and application.” International Journal of Advances in Soft Computing & Its Applications, vol. 13, no. 3, 2021.
+
+
+
[3]
B. Massimino, “Accessing online data: Web-crawling and information-scraping techniques to automate the assembly of research data,” Journal of Business Logistics, vol. 37, no. 1, pp. 34–42, 2016.
+
+
+
[4]
M. A. Kausar, V. Dhaka, and S. K. Singh, “Web crawler: A review,” International Journal of Computer Applications, vol. 63, no. 2, pp. 31–36, 2013.
+
+
+
[5]
C. Saini and V. Arora, “Information retrieval in web crawling: A survey,” in 2016 international conference on advances in computing, communications and informatics (ICACCI), IEEE, 2016, pp. 2635–2643.
+
+
+
[6]
I. Hernández, C. R. Rivero, and D. Ruiz, “Deep web crawling: A survey,” World Wide Web, vol. 22, pp. 1577–1610, 2019.
+
+
+
[7]
S. Deshmukh and K. Vishwakarma, “A survey on crawlers used in developing search engine,” in 2021 5th international conference on intelligent computing and control systems (ICICCS), IEEE, 2021, pp. 1446–1452.
+
+
+
[8]
Octoparse, “Web crawl.” 2024. Available: https://www.octoparse.com/
+
+
+
[9]
J. J. Cook, “An introduction to quarto: A versatile open-source tool for data reporting and visualization.”
+
+
+
[10]
S. Mati, I. Civcir, and S. I. Abba, “EviewsR: An r package for dynamic and reproducible research using EViews, r, r markdown and quarto.” R Journal, vol. 15, no. 2, 2023.
+
+
+
[11]
C. Paciorek, “An example quarto markdown file,” 2023.
+
+
+
[12]
I. Miroshnychenko, “Quarto: Revolutionizing content creation,” p. 189, 2023.
+
+
+
[13]
R. F. Hassan and S. Hussain, “Improving the web indexing quality through a website-search engine coactions,” International Journal of Computer and Information Technology, vol. 3, no. 2, 2014.
+
+
+
[14]
M. Coe, “Website indexing,” The Indexer: The International Journal of Indexing, vol. 34, no. 1, pp. 20–25, 2016.
+
+
+ + +
+ + + + + \ No newline at end of file diff --git a/guidelines.html b/guidelines.html new file mode 100644 index 0000000..6fbe370 --- /dev/null +++ b/guidelines.html @@ -0,0 +1,906 @@ + + + + + + + + + + + + + +Guidelines for Using Quarto Markdown + + + + + + + + + + + + + + + + + + + +
+ +
+ +
+
+

Guidelines for Using Quarto Markdown

+
+ +
+
+ This document provides guidelines for using Quarto Markdown in HTML for web crawling. +
+
+ + +
+ +
+
Author
+
+

Ayan Chatterjee, NILU DIGITAL

+
+
+ +
+
Published
+
+

October 30, 2024

+
+
+ + +
+ + +
+
+
Keywords
+

SEO, web crawling, Quarto Markdown, HTML

+
+
+ +
+ + +
+

Quarto Markdown Configuration for Multiple Output Formats

+

Add the following YAML configuration to the top of the .qmd file to enable multiple output formats, such as html, pdf, and docx:

+
---
+title: "Guidelines for Using Quarto Markdown in HTML for Web Crawling"
+author: "Your Name"
+date: "2024-10-08"
+format:
+  html: 
+    toc: true              # Include a Table of Contents
+    toc-title: "Contents"
+    toc-depth: 3
+  pdf:
+    toc: true
+    toc-depth: 3
+  docx:
+    toc: true
+    toc-depth: 3
+sitemap: true               # Enable sitemap generation for web crawlers
+keywords: ["SEO", "web crawling", "Quarto Markdown", "HTML"]
+description: "This document provides guidelines for using Quarto Markdown in HTML for web crawling."
+---
+

The toc option enables a Table of Contents for each specified output type, making navigation easier for longer documents. The toc-title option allows you to set a custom title for the Table of Contents, which is especially useful for HTML output. Additionally, the toc-depth option controls the level of headings included in the Table of Contents, allowing you to specify how detailed the outline should be based on the document’s heading hierarchy.

+

The YAML header includes metadata that is critical for Search Engine Optimization (SEO) and web crawling.

+
---
+title: "Optimized Web Crawling Document"
+author: "Your Name"
+date: "2024-10-08"
+format: html
+sitemap: true       # Enable sitemap generation for web crawlers
+toc: true           # Include a Table of Contents for better navigation
+toc-title: "Contents"
+toc-depth: 3        # Set TOC depth to include up to h3 headings
+keywords: ["SEO", "web crawling", "Quarto Markdown", "HTML"]
+description: "This document provides guidelines for using Quarto Markdown in HTML for web crawling."
+---
+
+
+

Quarto Markdown in HTML for Web Crawling

+

Purpose: This document provides guidelines on using Quarto Markdown to create HTML files optimized for web crawling. The steps and syntax shown here will help you structure content, enhance SEO, and improve the discoverability of your pages.

+
+

Prerequisites

+
    +
  1. Install RStudio: Download and install RStudio from RStudio Download.
  2. +
  3. Install Quarto: Follow Quarto installation to install the Quarto CLI.
  4. +
+
+
+

Basic Setup in RStudio

+
    +
  1. Create a New Quarto Document: +
      +
    • In RStudio, go to File > New File > Quarto Document.
    • +
    • Choose the type of document (e.g., HTML) and enter your title and metadata in the YAML header.
    • +
  2. +
  3. Save the File: +
      +
    • Save the file with a .qmd extension to ensure it is treated as a Quarto Markdown file.
    • +
  4. +
  5. YAML Header Configuration: +
      +
    • Configure the YAML header with essential metadata to optimize the document for web crawling.
    • +
  6. +
+
+
+
+

Essential HTML Structure

+
+

Title and Meta Description

+

Define an appropriate title and meta description in the YAML header, as these are essential for search engines.

+
---
+title: "Guide to Quarto Markdown for SEO"
+description: "Learn how to use Quarto Markdown to create SEO-optimized HTML content for web crawling."
+---
+
+
+

Headings and Subheadings

+

Organize content using structured headings (#, ##, ###) to create a hierarchy. This helps crawlers understand the structure and prioritize content.

+
---
+# Main Heading
+
+## Subheading 1
+
+Content here.
+
+### Subheading 1.1
+
+More content here.
+---
+
+
+

Linking Structure

+

Use descriptive anchor text for links and ensure that internal links are present to improve navigation.

+
---
+For more details, refer to the [Introduction to SEO](#introduction-to-seo).
+---
+
+
+

Image Alt Text and Descriptions

+

Add meaningful alt text to images to improve accessibility and indexing by search engines.

+
---
+![SEO Process](images/seo_process.png){alt="Diagram showing the process of SEO optimization"}
+---
+
+
+

Tables in Quarto Markdown

+ + + + + + + + + + + + + + + + + + + + + +
Table 1: Simple Table Example
Column 1 | Column 2 | Column 3
Row 1, Cell 1 | Row 1, Cell 2 | Row 1, Cell 3
Row 2, Cell 1 | Row 2, Cell 2 | Row 2, Cell 3
+ + + + + + + + + + + + + + + + + + + + + +
Table 2: Aligned Table
Left Align | Center Align | Right Align
Row 1, Cell 1 | Row 1, Cell 2 | Row 1, Cell 3
Row 2, Cell 1 | Row 2, Cell 2 | Row 2, Cell 3
+ + +++++ + + + + + + + + + + + + + + + + + + + +
Table 3: Grid Table Syntax
Column 1 | Column 2 | Column 3
Row 1, Cell 1 | Row 1, Cell 2 | Row 1, Cell 3
Row 2, Cell 1 | Row 2, Cell 2 | Row 2, Cell 3
+
+
+

Complex Table with Row and Column Spans

+
<table>
+  <tr>
+    <th rowspan="2">Column 1</th>
+    <th>Column 2</th>
+    <th>Column 3</th>
+  </tr>
+  <tr>
+    <td colspan="2">Spanning across 2 columns</td>
+  </tr>
+</table>
+
+
+
+

HTML Sitemap Generation for Web Crawling

+

Enabling the sitemap option in the YAML header creates a sitemap automatically. This sitemap file helps web crawlers discover and index all relevant pages.

+
+

Sample Sitemap Configuration

+

The automatically generated sitemap.xml file might contain entries like the following:

+
---
+<url>
+    <loc>https://<your-username>.github.io/<your-repo-name>/index.html</loc>
+    <lastmod>2024-10-08T12:24:05Z</lastmod>
+    <changefreq>monthly</changefreq>
+    <priority>0.8</priority>
+</url>
+
+---
+
+
+

Customizing Sitemap

+

To further customize, use the sitemap: attribute directly in the YAML header to control which pages are included or to add specific pages manually.

+
+
+

Additional Metadata for Social Media and Crawlers

+

Add Open Graph (og:) and Twitter metadata tags for better social media sharing and visibility.

+
---
+meta:
+  - name: "twitter:card"
+    content: "summary"
+  - name: "twitter:title"
+    content: "Guide to Quarto Markdown for SEO"
+  - name: "twitter:description"
+    content: "This guide helps you create SEO-optimized HTML content using Quarto Markdown."
+  - property: "og:title"
+    content: "Quarto Markdown for Web Crawling"
+  - property: "og:description"
+    content: "Optimize HTML content for search engines using Quarto Markdown in RStudio."
+---
+
+
+
+

Quarto Syntax for Key SEO Components

+
+

Structured Data with JSON-LD

+

Use structured data like JSON-LD to help search engines understand the context of your content.

+
---
+{
+  "@context": "https://schema.org",
+  "@type": "Article",
+  "headline": "Guide to Quarto Markdown for SEO",
+  "datePublished": "2024-10-08",
+  "author": {
+    "@type": "Person",
+    "name": "Your Name"
+  },
+  "keywords": "SEO, web crawling, Quarto Markdown, HTML"
+}
+---
+
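To embed this JSON-LD in the rendered page’s <head>, one approach (a sketch using Quarto’s include-in-header option; the headline value is a placeholder) is:

format:
  html:
    include-in-header:
      - text: |
          <script type="application/ld+json">
          {
            "@context": "https://schema.org",
            "@type": "Article",
            "headline": "Guide to Quarto Markdown for SEO"
          }
          </script>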
+
+

Linking External Stylesheets and JavaScript

+

For advanced functionality, link to external CSS and JS files. This enhances the user experience without compromising SEO.

+
---
+<link rel="stylesheet" href="https://your-stylesheet-url.css">
+<script src="https://your-script-url.js"></script>
+---
+
+
+
+

Rendering the Quarto Document in HTML

+

Once the .qmd file has been created, render it to HTML:

+
+

Render in RStudio

+

Go to the RStudio Terminal or Console and run:

+
quarto render yourfile.qmd
+

HTML output: quarto render yourfile.qmd --to html
PDF output: quarto render yourfile.qmd --to pdf
DOCX output: quarto render yourfile.qmd --to docx

+

Note: PDF output requires a LaTeX installation. If you haven’t installed LaTeX, you can use a lightweight distribution like TinyTeX (recommended for R users) or a full installation like MiKTeX or TeX Live. To install TinyTeX, run:

+
install.packages("tinytex")
+tinytex::install_tinytex()
+

We can customize DOCX output with a reference DOCX file by adding reference-doc in the docx configuration.
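For example, a minimal docx configuration (the template filename is a placeholder):

format:
  docx:
    reference-doc: custom-reference.docx  # styled Word template whose styles are copied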

+
+
+

Preview in Browser

+

Open the generated HTML file in a browser to ensure the content is well-structured for web crawling and SEO.

+
+
+
+

Best Practices Checklist for Accessible and SEO-Optimized Documents

+

This checklist ensures that your Quarto Markdown documents and tables are optimized for accessibility, SEO, and readability across multiple formats (HTML, PDF, DOCX).

+
+

General Document Best Practices

+
    +
  • Include complete YAML metadata (title, author, date, description, keywords) in every document.
  • Organize content with a clear heading hierarchy and enable a Table of Contents.
  • Use descriptive anchor text for internal and external links.
  • Provide meaningful alt text for every image and diagram.
  • Enable sitemap generation so web crawlers can discover all pages.
  • Add structured data (JSON-LD) and social media metadata where relevant.
  • Render and check the output in each target format (HTML, PDF, DOCX) before publishing.
+
+
+

Accessible Tables Best Practices

+
    +
  • Give every table a caption or descriptive title.
  • Use header rows so assistive technologies and crawlers can interpret cells correctly.
  • Prefer simple pipe or grid tables; use HTML tables only when row or column spans are required.
  • Set explicit column alignment where it aids readability.
  • Keep cell content concise and avoid deeply nested or merged cells.
+

Following these guidelines will ensure Quarto Markdown documents and tables are accessible, SEO-optimized, and suitable for multiple output formats.

+
+
+
+

Conclusion

+

This Quarto Markdown document can be saved with a .qmd extension, edited in RStudio, and rendered to HTML to ensure it follows best practices for web crawling.

+
+ +
+ + +
+ + + + + \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml new file mode 100644 index 0000000..b7e6a94 --- /dev/null +++ b/sitemap.xml @@ -0,0 +1,39 @@ + + + + https://.github.io//CLMS_doc_example.html + 2024-11-12T15:26:09Z + monthly + 0.8 + + + https://.github.io//CLMS_filenamingconvention.html + 2024-11-12T15:26:09Z + monthly + 0.8 + + + https://.github.io//CheatSheet.html + 2024-11-12T15:26:09Z + monthly + 0.8 + + + https://.github.io//README.html + 2024-11-12T15:26:09Z + monthly + 0.8 + + + https://.github.io//clms.html + 2024-11-12T15:26:09Z + monthly + 0.8 + + + https://.github.io//guidelines.html + 2024-11-12T15:26:09Z + monthly + 0.8 + +