-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclms.html
1227 lines (1197 loc) · 111 KB
/
clms.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
<meta charset="utf-8">
<meta name="generator" content="quarto-1.5.56">
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
<meta name="author" content="Ayan Chatterjee, Department of DIGITAL, NILU">
<meta name="dcterms.date" content="2024-09-16">
<meta name="keywords" content="CLMS standards, web crawlers, AI training, information formatting">
<title>Developing CLMS Standards for Generative AI Training and Web Crawlers Using Quarto Markdown and Sitemaps</title>
<style>
/* Base rules: inline code, small caps, column layout, hanging indents */
code {
  white-space: pre-wrap;
}
span.smallcaps {
  font-variant: small-caps;
}
div.columns {
  display: flex;
  gap: min(4vw, 1.5em);
}
div.column {
  flex: auto;
  overflow-x: auto;
}
div.hanging-indent {
  margin-left: 1.5em;
  text-indent: -1.5em;
}

/* Task lists: hide bullets and align the checkbox with the item text */
ul.task-list {
  list-style: none;
}
ul.task-list li input[type="checkbox"] {
  width: 0.8em;
  margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */
  vertical-align: middle;
}

/* CSS for syntax highlighting */
pre > code.sourceCode {
  white-space: pre;
  position: relative;
}
pre > code.sourceCode > span {
  line-height: 1.25;
}
pre > code.sourceCode > span:empty {
  height: 1.2em;
}
.sourceCode {
  overflow: visible;
}
code.sourceCode > span {
  color: inherit;
  text-decoration: inherit;
}
div.sourceCode {
  margin: 1em 0;
}
pre.sourceCode {
  margin: 0;
}

/* On screen, code listings scroll inside their container */
@media screen {
  div.sourceCode {
    overflow: auto;
  }
}

/* In print, long code lines wrap with a hanging indent */
@media print {
  pre > code.sourceCode {
    white-space: pre-wrap;
  }
  pre > code.sourceCode > span {
    display: inline-block;
    text-indent: -5em;
    padding-left: 5em;
  }
}

/* Numbered listings: a CSS counter renders the line number before each line anchor */
pre.numberSource code {
  counter-reset: source-line 0;
}
pre.numberSource code > span {
  position: relative;
  left: -4em;
  counter-increment: source-line;
}
pre.numberSource code > span > a:first-child::before {
  content: counter(source-line);
  position: relative;
  left: -1em;
  text-align: right;
  vertical-align: baseline;
  border: none;
  display: inline-block;
  /* Keep the generated line numbers out of text selection */
  -webkit-touch-callout: none;
  -webkit-user-select: none;
  -khtml-user-select: none;
  -moz-user-select: none;
  -ms-user-select: none;
  user-select: none;
  padding: 0 4px;
  width: 4em;
}
pre.numberSource {
  margin-left: 3em;
  padding-left: 4px;
}
div.sourceCode {
}
@media screen {
  pre > code.sourceCode > span > a:first-child::before {
    text-decoration: underline;
  }
}

/* CSS for citations */
div.csl-bib-body {
}
div.csl-entry {
  clear: both;
  margin-bottom: 0em;
}
.hanging-indent div.csl-entry {
  margin-left: 2em;
  text-indent: -2em;
}
div.csl-left-margin {
  min-width: 2em;
  float: left;
}
div.csl-right-inline {
  margin-left: 2em;
  padding-left: 1em;
}
div.csl-indent {
  margin-left: 2em;
}
</style>
<!-- Locally bundled assets from clms_files/libs/: clipboard, quarto-html (quarto, popper, tippy, anchor) and bootstrap scripts and stylesheets. The scripts load synchronously in the order listed. -->
<script src="clms_files/libs/clipboard/clipboard.min.js"></script>
<script src="clms_files/libs/quarto-html/quarto.js"></script>
<script src="clms_files/libs/quarto-html/popper.min.js"></script>
<script src="clms_files/libs/quarto-html/tippy.umd.min.js"></script>
<script src="clms_files/libs/quarto-html/anchor.min.js"></script>
<link href="clms_files/libs/quarto-html/tippy.css" rel="stylesheet">
<link href="clms_files/libs/quarto-html/quarto-syntax-highlighting.css" rel="stylesheet" id="quarto-text-highlighting-styles">
<script src="clms_files/libs/bootstrap/bootstrap.min.js"></script>
<link href="clms_files/libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
<link href="clms_files/libs/bootstrap/bootstrap.min.css" rel="stylesheet" id="quarto-bootstrap" data-mode="light">
</head>
<body>
<div id="quarto-content" class="page-columns page-rows-contents page-layout-article">
<div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
<div class="quarto-alternate-formats"><h2>Other Formats</h2><ul><li><a href="clms.pdf"><i class="bi bi-file-pdf"></i>PDF</a></li><li><a href="clms.docx"><i class="bi bi-file-word"></i>MS Word</a></li></ul></div></div>
<main class="content" id="quarto-document-content">
<header id="title-block-header" class="quarto-title-block default">
<div class="quarto-title">
<h1 class="title">Developing CLMS Standards for Generative AI Training and Web Crawlers Using Quarto Markdown and Sitemaps</h1>
<p class="subtitle lead">Task 10.1: Information Provisioning for Generative Chatbots</p>
</div>
<div class="quarto-title-meta">
<div>
<div class="quarto-title-meta-heading">Author</div>
<div class="quarto-title-meta-contents">
<p>Ayan Chatterjee, Department of DIGITAL, NILU </p>
</div>
</div>
<div>
<div class="quarto-title-meta-heading">Published</div>
<div class="quarto-title-meta-contents">
<p class="date">September 16, 2024</p>
</div>
</div>
</div>
<div>
<div class="keywords">
<div class="block-title">Keywords</div>
<p>CLMS standards, web crawlers, AI training, information formatting</p>
</div>
</div>
</header>
<div style="font-family: 'Times New Roman', serif; text-align: justify;">
<section id="abstract" class="level1">
<h1>Abstract</h1>
<p>Generative chatbots rely on large amounts of structured data to provide accurate, timely responses to user queries. By developing <strong>Copernicus Land Monitoring Service (CLMS)</strong> standards for information formatting and delivery using <strong>Quarto Markdown</strong> and <strong>sitemaps</strong>, we can ensure that the vast amounts of environmental data in CLMS are accessible to web crawlers and AI models. Using standardized structured content improves the discoverability and findability of CLMS products and makes it easier for users to access relevant datasets through traditional search engines and generative chatbots.</p>
<p>In addition, by providing clear guidelines for content formatting, cross-referencing, and sitemap management, this approach ensures that the CLMS data repository remains up-to-date and well-organized. This in turn supports the training of AI models to help users find exactly the CLMS products they need, whether through direct query or generative chatbot interaction.</p>
</section>
</div>
<div style="font-family: 'Times New Roman', serif; text-align: justify;">
<section id="introduction" class="level1">
<h1>1. Introduction</h1>
<section id="importance-of-copernicus-land-monitoring-service-clms" class="level2">
<h2 class="anchored" data-anchor-id="importance-of-copernicus-land-monitoring-service-clms">1.1. Importance of Copernicus Land Monitoring Service (CLMS)</h2>
<p>The Copernicus Land Monitoring Service (CLMS) is a critical component of the Copernicus Programme, which is the European Union’s Earth observation initiative <span class="citation" data-cites="CLMS"><a href="#ref-CLMS" role="doc-biblioref">[1]</a></span>. The service is responsible for providing timely and accurate land cover and land use data, along with a wide range of environmental variables related to land ecosystems. This data is essential for understanding and managing Europe’s environmental resources, supporting sustainable development, climate monitoring, and informed policy-making. The key areas where CLMS is vital include:</p>
<ul>
<li><p><strong>Environmental Monitoring</strong>: CLMS provides data on land cover, vegetation, soil, and water bodies, which are crucial for monitoring environmental changes such as deforestation, urban sprawl, and the health of ecosystems. This data supports conservation efforts and helps in tracking biodiversity and land degradation.</p></li>
<li><p><strong>Sustainable Land Management</strong>: With the growing need for sustainable practices, CLMS delivers data that helps governments and organizations plan and manage land resources more effectively. It supports agriculture, forestry, water management, and urban planning, helping to mitigate the effects of climate change.</p></li>
<li><p><strong>Climate Change Monitoring</strong>: CLMS plays a significant role in assessing the impact of climate change on European landscapes. It helps track changes in land use, vegetation, and land surface temperatures, which are important indicators of climate change impacts.</p></li>
<li><p><strong>Disaster Management</strong>: CLMS data is used for emergency response and disaster management, especially in cases of floods, fires, and other natural disasters. The accurate and near-real-time data allows authorities to take preventive actions and make quick decisions during emergencies.</p></li>
<li><p><strong>Policy Support and Decision-Making</strong>: The service supports EU environmental policies, including the Green Deal, Common Agricultural Policy (CAP), and the EU Biodiversity Strategy. The data provided by CLMS informs decision-makers at the European, national, and local levels, ensuring that policies are grounded in the latest environmental data.</p></li>
</ul>
</section>
<section id="importance-of-clms-documentation-for-web-crawlers-enhancing-product-discoverability-and-findability" class="level2">
<h2 class="anchored" data-anchor-id="importance-of-clms-documentation-for-web-crawlers-enhancing-product-discoverability-and-findability">1.2. Importance of CLMS Documentation for Web Crawlers: Enhancing Product Discoverability and Findability</h2>
<p>The discoverability and findability of CLMS products on the web are crucial for ensuring that this valuable environmental data is accessible to a wide range of users, including researchers, policymakers, and environmental organizations. Making CLMS documentation available on the web for crawlers facilitates product discoverability by enabling search engines and AI-powered systems (like generative chatbots) to index, retrieve, and present relevant data to users. Here’s why ensuring that CLMS documents are available to web crawlers is essential:</p>
<ul>
<li><p><strong>Increased Accessibility for Diverse Users</strong>: CLMS products cater to a broad audience, including government agencies, NGOs, scientists, and the public. Properly formatted and exposed documentation allows these users to easily find and access data via search engines. Web crawlers can efficiently index CLMS products, simplifying the search for specific datasets without navigating complex databases.</p></li>
<li><p><strong>Enhanced Search Engine Optimization (SEO)</strong>: Well-structured documentation with clear headings, descriptive metadata, and relevant keywords helps search engines understand and rank CLMS pages. Better ranking makes CLMS products more visible in search results, so users can reach the right dataset with fewer queries.</p></li>
<li><p><strong>Improved Product Findability Through AI and Chatbots</strong>: AI-powered search tools and chatbots use indexed information to generate responses. By ensuring that CLMS documentation is structured for crawling, CLMS products become accessible to third-party chatbots, expanding their reach through natural language queries and conversational interfaces.</p></li>
<li><p><strong>Faster and More Accurate Data Retrieval</strong>: Well-formatted CLMS documents enable faster and more accurate data retrieval, essential for time-sensitive applications like disaster management. Proper crawling ensures that search engines and AI systems provide up-to-date CLMS products, crucial for timely decision-making.</p></li>
<li><p><strong>Standardization and Interoperability</strong>: Adopting CLMS standards and formats like Quarto Markdown ensures consistency, making documents easier to index and retrieve. Standardization promotes interoperability, allowing CLMS data to be used across various platforms, including AI systems and environmental tools.</p></li>
<li><p><strong>Global Reach and Broader Impact</strong>: Making CLMS documents available to web crawlers increases their global reach. Optimized data allows users worldwide to access key environmental information, contributing to global initiatives, research, and policymaking beyond the EU.</p></li>
<li><p><strong>Supporting Third-Party Integration</strong>: Third-party platforms rely on web crawlers and AI tools to access CLMS data. By exposing CLMS products to crawlers, the data can be integrated into various tools and services, enhancing discoverability and promoting broader use in AI-driven analytics and public services.</p></li>
</ul>
<p>By making CLMS documents available to web crawlers using standardized formats such as HTML, PDF, and DOCX (which adhere to semantic structure, web standards, and use metadata), CLMS can ensure that its products are easily indexed, retrieved, and integrated into a variety of search engines, artificial intelligence systems, and chatbots. This strategy not only increases the visibility of CLMS products, but also improves accessibility to a global audience, ensuring that researchers, policymakers, and the public can effectively find and use CLMS data. At a time when timely, accurate environmental data is becoming increasingly important, optimizing CLMS products for web crawlers is a necessary step to ensure that everyone has access to these valuable resources.</p>
</section>
<section id="web-crawling-and-information-provisioning-for-generative-chatbots" class="level2">
<h2 class="anchored" data-anchor-id="web-crawling-and-information-provisioning-for-generative-chatbots">1.3. Web crawling and Information Provisioning for Generative Chatbots</h2>
<p>Web crawling is the process used by search engines to explore and index the web pages of websites. The crawler downloads pages, reads the content, and adds it to the search engine’s index. Crawlers are designed to navigate from one page to another by following hyperlinks, allowing them to efficiently cover a website’s entire structure. Search engines rely on crawlers to keep their results up-to-date by regularly visiting websites and checking for new or modified content. <strong>Googlebot</strong>, <strong>Bingbot</strong>, and <strong>Yahoo Slurp</strong> are some examples of popular web crawlers. Key terms involved in web crawling are:</p>
<ul>
<li><strong>Search engine</strong>: A system that allows users to search for content on the web.</li>
<li><strong>Indexing</strong>: The process of storing web content so it can be retrieved later.</li>
<li><strong>Web pages</strong>: Documents that make up the web, interconnected by hyperlinks.</li>
<li><strong>Hyperlinks</strong>: Links that connect different web pages, forming a navigable web.</li>
</ul>
<p>Web crawling has become essential for search engines and AI applications. The integration of these technologies has been explored extensively <span class="citation" data-cites="khder2021web massimino2016accessing kausar2013web saini2016information"><a href="#ref-khder2021web" role="doc-biblioref">[2]</a>, <a href="#ref-massimino2016accessing" role="doc-biblioref">[3]</a>, <a href="#ref-kausar2013web" role="doc-biblioref">[4]</a>, <a href="#ref-saini2016information" role="doc-biblioref">[5]</a></span>. The growth of digital content has placed significant demands on the efficiency and accuracy of web crawlers and artificial intelligence (AI) models <span class="citation" data-cites="hernandez2019deep deshmukh2021survey"><a href="#ref-hernandez2019deep" role="doc-biblioref">[6]</a>, <a href="#ref-deshmukh2021survey" role="doc-biblioref">[7]</a></span>. In response, content standards for the Copernicus Land Monitoring Service (CLMS) are essential for establishing uniformity in the way data is formatted, structured, and exposed for automated tools like crawlers and AI training datasets. These standards help ensure that content is easy to access, interpret, and process, leading to more accurate information retrieval and AI model training. This document outlines the development of CLMS standards for exposing information to web crawlers and optimizing the formatting for AI data ingestion. <a href="#fig-ai-training" class="quarto-xref">Figure 1</a> illustrates how a web crawler works <span class="citation" data-cites="Crawl"><a href="#ref-Crawl" role="doc-biblioref">[8]</a></span>.</p>
<!-- Figure 1: web crawling diagram. The caption below is linked to the image wrapper through aria-describedby. -->
<div id="fig-ai-training" class="quarto-float quarto-figure quarto-figure-center anchored" data-align="center">
<figure class="quarto-float quarto-float-fig figure">
<div aria-describedby="fig-ai-training-caption-0ceaefa1-69ba-4598-a22c-09a6ac19f8ca">
<!-- alt gives the image an accessible name for screen readers and a text fallback if the file fails to load -->
<img src="images/web_crawling_diagram.png" class="img-fluid figure-img" style="width:65.0%" data-align="center" alt="Diagram of the web crawling process">
</div>
<figcaption class="quarto-float-caption-bottom quarto-float-caption quarto-float-fig" id="fig-ai-training-caption-0ceaefa1-69ba-4598-a22c-09a6ac19f8ca">
Figure 1: Diagram illustrating web crawling <span class="citation" data-cites="Crawl"><a href="#ref-Crawl" role="doc-biblioref">[8]</a></span>.
</figcaption>
</figure>
</div>
<p>In recent years, generative chatbots have made great progress and become powerful tools that allow users to access detailed information and conduct complex queries. In particular, chatbots can help users explore certain aspects of CLMS products, such as allocation rules or the purpose of a particular product. These tools are not only critical for product discoverability, but also improve user understanding of CLMS products. To ensure that chatbots effectively help users find and understand CLMS products, it is important that the underlying information is formatted and presented in a way that is easy to find and use. This requires well-structured documentation and a system that allows web crawlers and AI models to effectively access and process CLMS data.</p>
<p>Web crawlers and AI models are critical to the discoverability of online information. Web crawlers that index websites rely on well-structured content to perform their tasks effectively. Similarly, generative AI models, including chatbots, require high-quality structured data to produce accurate and meaningful results. CLMS provides important environmental data, but in order for this data to be useful to AI models and easy for users to find, it must be properly formatted and made available.</p>
<section id="motivation" class="level3">
<h3 class="anchored" data-anchor-id="motivation">1.3.1. Motivation</h3>
<p>The relationship between AI and web crawlers has led to new frontiers in both industries. The primary motivation for creating CLMS standards lies in the need for:</p>
<ul>
<li><p><strong>Improved Crawling Efficiency</strong>: Properly formatted content with metadata helps crawlers index relevant information faster and more accurately.</p></li>
<li><p><strong>Better AI Model Training</strong>: Consistent content structure ensures that AI models are trained on high-quality, organized data.</p></li>
<li><p><strong>Data Accessibility</strong>: Standardizing the structure of content ensures that information is universally accessible across platforms.</p></li>
</ul>
<p>The following key aspects are critical for ensuring that data is structured and accessible for web crawlers and AI systems:</p>
<ul>
<li><p><strong>Uniform metadata</strong>: Consistent metadata usage across all content is essential. Metadata includes details like title, author, keywords, and publication date. Uniform metadata ensures that web crawlers and AI systems can easily index and categorize content, improving searchability and discoverability.</p></li>
<li><p><strong>Clearly defined content sections</strong>: Content should be organized into distinct sections, such as titles, headings, and subheadings. This structured format helps both users and machines navigate through the content efficiently, making key information easy to locate and retrieve.</p></li>
<li><p><strong>Embedded structured data formats</strong>: Incorporating structured data formats such as <strong>JSON-LD</strong>, <strong>RDF</strong>, or <strong>XML</strong> provides a precise way of representing information. These formats help web crawlers and AI systems understand relationships and attributes within the content, facilitating accurate extraction, interpretation, and use of the data across various platforms.</p></li>
</ul>
</section>
<section id="importance" class="level3">
<h3 class="anchored" data-anchor-id="importance">1.3.2. Importance</h3>
<ul>
<li><p><strong>Enhanced Web Crawling</strong>: Properly structured CLMS content will improve web crawlers’ ability to index and retrieve information.</p></li>
<li><p><strong>Improved AI Training</strong>: Structured data ensures higher-quality datasets, which result in better-trained AI models, particularly for generative chatbots.</p></li>
<li><p><strong>Better User Experience</strong>: By improving product discoverability and findability, users will have an easier time accessing and understanding CLMS products.</p></li>
</ul>
<div class="callout callout-style-default callout-tip callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Tip
</div>
</div>
<div class="callout-body-container callout-body">
<p>Given the growing complexity of CLMS products and the increasing reliance on generative AI tools, it is critical to implement standards that improve the discoverability and usability of CLMS data.</p>
</div>
</div>
<div class="callout callout-style-default callout-note callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Note
</div>
</div>
<div class="callout-body-container callout-body">
<p>By standardizing the format and delivery of CLMS information, our goal is to ensure that generative AI applications, such as web crawlers and chatbots, can effectively access and use this data.</p>
</div>
</div>
</section>
</section>
</section>
<section id="content-standards" class="level1">
<h1>2. Content Standards</h1>
<p>Developing content standards requires collaboration between content creators, data engineers, and AI researchers. The process typically follows these stages for different document types in use:</p>
<section id="content-structuring" class="level2">
<h2 class="anchored" data-anchor-id="content-structuring">2.1. Content Structuring</h2>
<p>Content structuring involves organizing data into recognizable, standard components, such as:</p>
<ul>
<li><p><strong>Title</strong>: Main identifier of the content.</p></li>
<li><p><strong>Metadata</strong>: Information about the content, including authors, dates, keywords, and relevant classification.</p></li>
<li><p><strong>Headings and Subheadings</strong>: Structured sections that break down the content into digestible parts.</p></li>
</ul>
<p>An example of metadata formatting is given below:</p>
<div class="sourceCode" id="cb1"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="pp">---</span></span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="fu">title</span><span class="kw">:</span><span class="at"> </span><span class="st">"Developing CLMS Standards for Generative AI Training and Web Crawlers"</span></span>
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="fu">subtitle</span><span class="kw">:</span><span class="at"> </span><span class="st">"Task 10.1: Information Provisioning for Generative Chatbots"</span></span>
<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a><span class="fu">author</span><span class="kw">:</span><span class="at"> </span><span class="st">"Ayan Chatterjee, Department of DIGITAL, NILU, [email protected]."</span></span>
<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a><span class="fu">date</span><span class="kw">:</span><span class="at"> </span><span class="st">"2024-09-10"</span></span>
<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a><span class="fu">sitemap</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span><span class="co"> #Enables sitemap generation for web crawlers</span></span>
<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a><span class="fu">toc</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span><span class="co"> # Enable the Table of Contents</span></span>
<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a><span class="fu">toc-title</span><span class="kw">:</span><span class="at"> </span><span class="st">"Index"</span><span class="co"> # Customize the title of the table of contents</span></span>
<span id="cb1-9"><a href="#cb1-9" aria-hidden="true" tabindex="-1"></a><span class="fu">toc-depth</span><span class="kw">:</span><span class="at"> </span><span class="dv">3</span><span class="co"> # Include headings up to level 3 (</span><span class="al">###</span><span class="co">)</span></span>
<span id="cb1-10"><a href="#cb1-10" aria-hidden="true" tabindex="-1"></a><span class="fu">keywords</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="st">"CLMS standards"</span><span class="kw">,</span><span class="at"> </span><span class="st">"web crawlers"</span><span class="kw">,</span><span class="at"> </span><span class="st">"AI training"</span><span class="kw">,</span><span class="at"> </span><span class="st">"information formatting"</span><span class="kw">]</span></span>
<span id="cb1-11"><a href="#cb1-11" aria-hidden="true" tabindex="-1"></a><span class="fu">bibliography</span><span class="kw">:</span><span class="at"> references.bib</span><span class="co"> # Link to the bibliography file</span></span>
<span id="cb1-12"><a href="#cb1-12" aria-hidden="true" tabindex="-1"></a><span class="fu">csl</span><span class="kw">:</span><span class="at"> ieee.csl</span><span class="co"> # Link to the CSL file for IEEE style</span></span>
<span id="cb1-13"><a href="#cb1-13" aria-hidden="true" tabindex="-1"></a><span class="fu">format</span><span class="kw">:</span><span class="at"> </span></span>
<span id="cb1-14"><a href="#cb1-14" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">html</span><span class="kw">:</span><span class="at"> default</span></span>
<span id="cb1-15"><a href="#cb1-15" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">pdf</span><span class="kw">:</span><span class="at"> default</span></span>
<span id="cb1-16"><a href="#cb1-16" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">docx</span><span class="kw">:</span><span class="at"> default</span></span>
<span id="cb1-17"><a href="#cb1-17" aria-hidden="true" tabindex="-1"></a><span class="pp">---</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="html-structuring" class="level2">
<h2 class="anchored" data-anchor-id="html-structuring">2.2. HTML Structuring</h2>
<p>The following structured approach in HTML allows web crawlers to effectively index and retrieve content while facilitating AI training for generative models, ensuring that information is both accessible and usable:</p>
<section id="semantic-structuring-and-formatting" class="level3">
<h3 class="anchored" data-anchor-id="semantic-structuring-and-formatting">2.2.1. Semantic Structuring and Formatting</h3>
<p>To enhance both <strong>machine readability</strong> and <strong>user comprehension</strong>, we must follow structured and semantic formatting principles. This includes using HTML5 elements and schema markup, and providing clear metadata. Using HTML5 semantic elements like <code>&lt;article&gt;</code>, <code>&lt;section&gt;</code>, <code>&lt;header&gt;</code>, and <code>&lt;footer&gt;</code> helps structure the document meaningfully. For example:</p>
<div class="sourceCode" id="cb2"><pre class="sourceCode html code-with-copy"><code class="sourceCode html"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="dt"><</span><span class="kw">article</span><span class="dt">></span></span>
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a> <span class="dt"><</span><span class="kw">header</span><span class="dt">></span></span>
<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a> <span class="dt"><</span><span class="kw">h1</span><span class="dt">></span>Understanding Web Crawlers<span class="dt"></</span><span class="kw">h1</span><span class="dt">></span></span>
<span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a> <span class="dt"><</span><span class="kw">meta</span><span class="ot"> name</span><span class="op">=</span><span class="st">"description"</span><span class="ot"> content</span><span class="op">=</span><span class="st">"How web crawlers work and index ..!"</span><span class="ot"> </span><span class="dt">/></span></span>
<span id="cb2-5"><a href="#cb2-5" aria-hidden="true" tabindex="-1"></a> <span class="dt"></</span><span class="kw">header</span><span class="dt">></span></span>
<span id="cb2-6"><a href="#cb2-6" aria-hidden="true" tabindex="-1"></a> <span class="dt"><</span><span class="kw">section</span><span class="dt">></span></span>
<span id="cb2-7"><a href="#cb2-7" aria-hidden="true" tabindex="-1"></a> <span class="dt"><</span><span class="kw">h2</span><span class="dt">></span>How Crawlers Index Content<span class="dt"></</span><span class="kw">h2</span><span class="dt">></span></span>
<span id="cb2-8"><a href="#cb2-8" aria-hidden="true" tabindex="-1"></a> <span class="dt"><</span><span class="kw">p</span><span class="dt">></span>Web crawlers use semantic structure to efficiently index web pages.<span class="dt"></</span><span class="kw">p</span><span class="dt">></span></span>
<span id="cb2-9"><a href="#cb2-9" aria-hidden="true" tabindex="-1"></a> <span class="dt"></</span><span class="kw">section</span><span class="dt">></span></span>
<span id="cb2-10"><a href="#cb2-10" aria-hidden="true" tabindex="-1"></a> <span class="dt"><</span><span class="kw">footer</span><span class="dt">></span></span>
<span id="cb2-11"><a href="#cb2-11" aria-hidden="true" tabindex="-1"></a> <span class="dt"><</span><span class="kw">p</span><span class="dt">></span>Author: Ayan Chatterjee<span class="dt"></</span><span class="kw">p</span><span class="dt">></span></span>
<span id="cb2-12"><a href="#cb2-12" aria-hidden="true" tabindex="-1"></a> <span class="dt"></</span><span class="kw">footer</span><span class="dt">></span></span>
<span id="cb2-13"><a href="#cb2-13" aria-hidden="true" tabindex="-1"></a><span class="dt"></</span><span class="kw">article</span><span class="dt">></span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="microdata-for-enhancing-machine-readability" class="level3">
<h3 class="anchored" data-anchor-id="microdata-for-enhancing-machine-readability">2.2.2. Microdata for Enhancing Machine Readability</h3>
<p>Microdata attributes such as itemscope, itemtype, and itemprop provide semantic clarity for machines, enabling more efficient crawling and interpretation.</p>
<div class="sourceCode" id="cb3"><pre class="sourceCode html code-with-copy"><code class="sourceCode html"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="dt"><</span><span class="kw">article</span><span class="ot"> itemscope itemtype</span><span class="op">=</span><span class="st">"https://schema.org/Article"</span><span class="dt">></span></span>
<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a> <span class="dt"><</span><span class="kw">header</span><span class="dt">></span></span>
<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a> <span class="dt"><</span><span class="kw">h1</span><span class="ot"> itemprop</span><span class="op">=</span><span class="st">"headline"</span><span class="dt">></span>Web Crawling Explained<span class="dt"></</span><span class="kw">h1</span><span class="dt">></span></span>
<span id="cb3-4"><a href="#cb3-4" aria-hidden="true" tabindex="-1"></a> <span class="dt"><</span><span class="kw">meta</span><span class="ot"> itemprop</span><span class="op">=</span><span class="st">"description"</span><span class="ot"> content</span><span class="op">=</span><span class="st">"How web crawlers index ..?"</span><span class="ot"> </span><span class="dt">/></span></span>
<span id="cb3-5"><a href="#cb3-5" aria-hidden="true" tabindex="-1"></a> <span class="dt"></</span><span class="kw">header</span><span class="dt">></span></span>
<span id="cb3-6"><a href="#cb3-6" aria-hidden="true" tabindex="-1"></a><span class="dt"></</span><span class="kw">article</span><span class="dt">></span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="schema-markup-for-structured-content" class="level3">
<h3 class="anchored" data-anchor-id="schema-markup-for-structured-content">2.2.3. Schema Markup for Structured Content</h3>
<p>Use Schema Markup (like ResearchArticle, Dataset, or CreativeWork) to define the content type and enhance machine readability. This helps both web crawlers and AI to categorize content accurately.</p>
<div class="sourceCode" id="cb4"><pre class="sourceCode html code-with-copy"><code class="sourceCode html"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="dt"><</span><span class="kw">article</span><span class="ot"> itemscope itemtype</span><span class="op">=</span><span class="st">"https://schema.org/ResearchArticle"</span><span class="dt">></span></span>
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a> <span class="dt"><</span><span class="kw">header</span><span class="dt">></span></span>
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a> <span class="dt"><</span><span class="kw">h1</span><span class="ot"> itemprop</span><span class="op">=</span><span class="st">"headline"</span><span class="dt">></span>AI Training for Web Crawlers<span class="dt"></</span><span class="kw">h1</span><span class="dt">></span></span>
<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a> <span class="dt"><</span><span class="kw">meta</span><span class="ot"> itemprop</span><span class="op">=</span><span class="st">"description"</span><span class="ot"> content</span><span class="op">=</span><span class="st">" AI training techniques for .."</span><span class="ot"> </span><span class="dt">/></span></span>
<span id="cb4-5"><a href="#cb4-5" aria-hidden="true" tabindex="-1"></a> <span class="dt"></</span><span class="kw">header</span><span class="dt">></span></span>
<span id="cb4-6"><a href="#cb4-6" aria-hidden="true" tabindex="-1"></a><span class="dt"></</span><span class="kw">article</span><span class="dt">></span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="headings-and-subheadings" class="level3">
<h3 class="anchored" data-anchor-id="headings-and-subheadings">2.2.4. Headings and Subheadings</h3>
<p>Provide clearly defined headings and subheadings to organize content for easier navigation and indexing by crawlers.</p>
<div class="sourceCode" id="cb5"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="pp">---</span></span>
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a><span class="co"># How AI Models are Trained</span></span>
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a><span class="co">## Data Collection</span></span>
<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a><span class="co">## Model Training</span></span>
<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a><span class="co">## Evaluation</span></span>
<span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a><span class="pp">---</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="alt-text-and-descriptions" class="level3">
<h3 class="anchored" data-anchor-id="alt-text-and-descriptions">2.2.5. Alt Text and Descriptions</h3>
<p>For images and diagrams, always provide alt text and descriptions to improve accessibility.</p>
<div class="sourceCode" id="cb6"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="at">![A diagram illustrating how web crawlers work]</span></span>
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a><span class="at">(images/web_crawlers.png){alt="A diagram of web crawler processes" width=50%}</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="meta-tags-and-descriptions" class="level3">
<h3 class="anchored" data-anchor-id="meta-tags-and-descriptions">2.2.6. Meta Tags and Descriptions</h3>
<p>Add meta tags and descriptions to help web crawlers index the content more accurately.</p>
<div class="sourceCode" id="cb7"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="at"><meta name="description" content="How web crawlers work effectively!" /></span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="phrasing-and-content-presentation" class="level3">
<h3 class="anchored" data-anchor-id="phrasing-and-content-presentation">2.2.7. Phrasing and Content Presentation</h3>
<p>Ensure that important keywords are present in titles, headings, and throughout the content without overusing them (avoid keyword stuffing).</p>
<div class="sourceCode" id="cb8"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Introduction to Web Crawlers and AI Training</span></span>
<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a><span class="at">Web crawlers, also known as spiders, are used by search engines to index web ...</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<p>Write in a clear and concise manner. Avoid jargon unless necessary, and ensure that key concepts are easy to understand.</p>
<div class="sourceCode" id="cb9"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a><span class="at">Web crawlers automatically scan websites to collect and index content. </span></span>
<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a><span class="at">They follow links, downloading web pages and saving them for future queries.</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<p>Use hyperlinks and cross-references to guide both users and web crawlers to related content.</p>
<div class="sourceCode" id="cb10"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="at">For more details, see the [Introduction to AI Training](</span><span class="co">#data-collection).</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<p>Provide a brief abstract or summary at the beginning of each article or section for better clarity and indexing.</p>
<div class="sourceCode" id="cb11"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a><span class="ot">**Summary:**</span><span class="at"> This article provides an overview of web crawlers, how they index content, </span></span>
<span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a><span class="at">and their integration with AI.</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="structured-data-repositories" class="level3">
<h3 class="anchored" data-anchor-id="structured-data-repositories">2.2.8. Structured Data Repositories</h3>
<p>To enable knowledge transfer to generative AI, use standardized formats like JSON-LD, RDF, or XML to define metadata and structure.</p>
<div class="sourceCode" id="cb12"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a><span class="kw">{</span></span>
<span id="cb12-2"><a href="#cb12-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">"@context"</span><span class="kw">:</span><span class="at"> </span><span class="st">"https://schema.org"</span><span class="kw">,</span></span>
<span id="cb12-3"><a href="#cb12-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">"@type"</span><span class="kw">:</span><span class="at"> </span><span class="st">"Dataset"</span><span class="kw">,</span></span>
<span id="cb12-4"><a href="#cb12-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">"name"</span><span class="kw">:</span><span class="at"> </span><span class="st">"AI Training Dataset"</span><span class="kw">,</span></span>
<span id="cb12-5"><a href="#cb12-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">"description"</span><span class="kw">:</span><span class="at"> </span><span class="st">"A dataset designed to improve search engine crawlers."</span></span>
<span id="cb12-6"><a href="#cb12-6" aria-hidden="true" tabindex="-1"></a><span class="kw">}</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="sourceCode" id="cb13"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="at"><dataset xmlns="http://www.w3.org/2001/XMLSchema-instance" type="AI Training Dataset"></span></span>
<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a><span class="at"> <name>AI Training Dataset</name></span></span>
<span id="cb13-3"><a href="#cb13-3" aria-hidden="true" tabindex="-1"></a><span class="at"> <description>A dataset designed for training AI models.</description></span></span>
<span id="cb13-4"><a href="#cb13-4" aria-hidden="true" tabindex="-1"></a><span class="at"></dataset></span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
</section>
<section id="pdf-structuring" class="level2">
<h2 class="anchored" data-anchor-id="pdf-structuring">2.3. PDF Structuring</h2>
<p>The following structured approach in PDF will improve documents for indexing by web crawlers, integration with AI systems, and overall improved accessibility for users:</p>
<section id="accessible-pdf-formats-by-tagging" class="level3">
<h3 class="anchored" data-anchor-id="accessible-pdf-formats-by-tagging">2.3.1. Accessible PDF Formats by Tagging</h3>
<p>Ensure that the PDF is tagged properly so that screen readers and AI tools can interpret the document structure. For instance, headings, paragraphs, and lists should be tagged semantically.</p>
<div class="sourceCode" id="cb14"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Heading 1 (tagged as <h1>)</span></span>
<span id="cb14-2"><a href="#cb14-2" aria-hidden="true" tabindex="-1"></a><span class="kw">-</span><span class="at"> List item 1 (tagged as <ul><li>)</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="structuring-and-formatting" class="level3">
<h3 class="anchored" data-anchor-id="structuring-and-formatting">2.3.2. Structuring and Formatting</h3>
<p>The document structure should be accessible, with a clear hierarchy and a clickable table of contents (TOC). Accessible tagging, hierarchical organization, and text over image improve the usability for both humans and machines.</p>
<p>Organize content into a well-defined hierarchy using headings (#, ##, ###). This improves both user navigation and machine parsing for AI and web crawlers.</p>
<div class="sourceCode" id="cb15"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a><span class="co">## Section 1: Introduction</span></span>
<span id="cb15-2"><a href="#cb15-2" aria-hidden="true" tabindex="-1"></a><span class="co">### Subsection 1.1: Overview</span></span>
<span id="cb15-3"><a href="#cb15-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb15-4"><a href="#cb15-4" aria-hidden="true" tabindex="-1"></a><span class="fu">toc</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span></span>
<span id="cb15-5"><a href="#cb15-5" aria-hidden="true" tabindex="-1"></a><span class="fu">toc-depth</span><span class="kw">:</span><span class="at"> </span><span class="dv">2</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="adding-metadata" class="level3">
<h3 class="anchored" data-anchor-id="adding-metadata">2.3.3. Adding Metadata</h3>
<p>Embedding metadata such as document properties (e.g., Title, Author, Subject, and Keywords), XMP metadata, Schema.org metadata, and descriptive metadata helps search engines and AI systems index, categorize, and retrieve information efficiently.</p>
<div class="sourceCode" id="cb16"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb16-1"><a href="#cb16-1" aria-hidden="true" tabindex="-1"></a><span class="fu">title</span><span class="kw">:</span><span class="at"> </span><span class="st">"PDF Structuring and Formatting"</span></span>
<span id="cb16-2"><a href="#cb16-2" aria-hidden="true" tabindex="-1"></a><span class="fu">author</span><span class="kw">:</span><span class="at"> </span><span class="st">"Ayan Chatterjee"</span></span>
<span id="cb16-3"><a href="#cb16-3" aria-hidden="true" tabindex="-1"></a><span class="fu">subject</span><span class="kw">:</span><span class="at"> </span><span class="st">"Document Accessibility and Metadata"</span></span>
<span id="cb16-4"><a href="#cb16-4" aria-hidden="true" tabindex="-1"></a><span class="fu">keywords</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="st">"PDF accessibility"</span><span class="kw">,</span><span class="at"> </span><span class="st">"metadata"</span><span class="kw">,</span><span class="at"> </span><span class="st">"AI integration"</span><span class="kw">]</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<p>XMP metadata is stored as XML in the PDF file, allowing for rich data descriptions. Schema.org metadata in JSON-LD provides structured information that AI and web crawlers can easily understand.</p>
<div class="sourceCode" id="cb17"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb17-1"><a href="#cb17-1" aria-hidden="true" tabindex="-1"></a><span class="kw">{</span></span>
<span id="cb17-2"><a href="#cb17-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">"@context"</span><span class="kw">:</span><span class="at"> </span><span class="st">"https://schema.org"</span><span class="kw">,</span></span>
<span id="cb17-3"><a href="#cb17-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">"@type"</span><span class="kw">:</span><span class="at"> </span><span class="st">"CreativeWork"</span><span class="kw">,</span></span>
<span id="cb17-4"><a href="#cb17-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">"name"</span><span class="kw">:</span><span class="at"> </span><span class="st">"PDF Structuring and Formatting"</span><span class="kw">,</span></span>
<span id="cb17-5"><a href="#cb17-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">"author"</span><span class="kw">:</span><span class="at"> </span><span class="kw">{</span></span>
<span id="cb17-6"><a href="#cb17-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">"@type"</span><span class="kw">:</span><span class="at"> </span><span class="st">"Person"</span><span class="kw">,</span></span>
<span id="cb17-7"><a href="#cb17-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">"name"</span><span class="kw">:</span><span class="at"> </span><span class="st">"Jane Doe"</span></span>
<span id="cb17-8"><a href="#cb17-8" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">},</span></span>
<span id="cb17-9"><a href="#cb17-9" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">"keywords"</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="st">"PDF accessibility"</span><span class="kw">,</span><span class="at"> </span><span class="st">"metadata"</span><span class="kw">,</span><span class="at"> </span><span class="st">"AI integration"</span><span class="kw">]</span></span>
<span id="cb17-10"><a href="#cb17-10" aria-hidden="true" tabindex="-1"></a><span class="kw">}</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="optimizing-content-presentation" class="level3">
<h3 class="anchored" data-anchor-id="optimizing-content-presentation">2.3.4. Optimizing Content Presentation</h3>
<p>Ensuring the proper placement of keywords, providing alt text for images, and correctly labeling figures and tables contribute to the searchability and accessibility of the content. This is crucial for effective interaction with web crawlers and AI models.</p>
<div class="sourceCode" id="cb18"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb18-1"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a><span class="fu">Keywords</span><span class="kw">:</span><span class="at"> PDF accessibility, web crawlers, generative AI</span></span>
<span id="cb18-2"><a href="#cb18-2" aria-hidden="true" tabindex="-1"></a><span class="at">![A flowchart showing the PDF processing workflow](images/pdf_workflow.png){alt="PDF workflow"}</span></span>
<span id="cb18-3"><a href="#cb18-3" aria-hidden="true" tabindex="-1"></a><span class="fu">![Figure 1</span><span class="kw">:</span><span class="at"> A table of contents structure](images/toc_structure.png){</span><span class="co">#fig-toc}</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="setting-up-for-knowledge-transfer-to-generative-ai" class="level3">
<h3 class="anchored" data-anchor-id="setting-up-for-knowledge-transfer-to-generative-ai">2.3.5. Setting Up for Knowledge Transfer to Generative AI</h3>
<p>Using machine-readable fonts (e.g., Arial, Times New Roman), a clean and simple layout, and adding comments or annotations helps prepare the document for use in generative AI systems. AI models benefit from well-structured and easy-to-parse content, which improves their ability to understand and generate meaningful responses based on the content.</p>
<div class="sourceCode" id="cb19"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb19-1"><a href="#cb19-1" aria-hidden="true" tabindex="-1"></a><span class="co">## Section 1: Overview</span></span>
<span id="cb19-2"><a href="#cb19-2" aria-hidden="true" tabindex="-1"></a><span class="at">This section introduces the importance of accessible PDFs for AI processing...</span></span>
<span id="cb19-3"><a href="#cb19-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb19-4"><a href="#cb19-4" aria-hidden="true" tabindex="-1"></a><span class="at"><!-- This annotation explains the role of hierarchical metadata for AI --></span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="callout callout-style-default callout-important callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Important
</div>
</div>
<div class="callout-body-container callout-body">
<p>By following such structured practices, we can ensure that the content is both human-readable and machine-readable, facilitating easy discovery by web crawlers and seamless integration with AI training systems.</p>
</div>
</div>
</section>
</section>
</section>
<section id="developing-clms-standards" class="level1">
<h1>3. Developing CLMS Standards</h1>
<p>In the context of <strong>Developing CLMS Standards</strong>, it is essential to utilize advanced tools that support both the creation of well-structured documents and the easy discoverability of content for web crawlers and AI systems. Several tools are available for content formatting, documentation, and publication. Among these, <strong>Quarto</strong> stands out due to its versatility, allowing users to create, format, and publish documents in multiple formats (HTML, PDF, Word) with integrated support for code execution and structured content.</p>
<p>This section compares several of these tools, explaining why <strong>Quarto</strong> is particularly suitable for creating CLMS-compliant documentation. We’ll also cover how to configure Quarto with <strong>Jupyter Notebooks</strong> and the importance of using <strong>Quarto Markdown</strong> for CLMS content. A Quarto Markdown file provides a structured approach to documenting the development of CLMS standards, ensuring content is easily accessible by both web crawlers and AI systems.</p>
<section id="tools-for-clms-documentation" class="level2">
<h2 class="anchored" data-anchor-id="tools-for-clms-documentation">3.1. Tools for CLMS Documentation</h2>
<ul>
<li><p><strong>Quarto</strong>: Quarto is a highly versatile tool for creating and publishing documents, including PDFs, with rich formatting, code integration, and support for multiple formats (HTML, PDF, Word). Quarto’s cross-platform capabilities make it ideal for creating structured and searchable documents for CLMS, supporting web crawlers and AI applications.</p></li>
<li><p><strong>R Markdown</strong>: A popular tool in the R community that allows users to combine narrative text with R code, producing output in HTML, PDF, and Word formats. Though powerful for statistical analysis, it is more limited in non-R-based workflows compared to Quarto.</p></li>
<li><p><strong>Jupyter Notebooks</strong>: An interactive tool supporting over 40 programming languages, commonly used for data science and computing. Notebooks can be exported to multiple formats (HTML, PDF, slides), but lack Quarto’s advanced content formatting features.</p></li>
<li><p><strong>Pandoc</strong>: A universal document converter that enables conversion between various markup formats, including Markdown, LaTeX, and HTML. While powerful for conversions, Pandoc lacks the code integration and dynamic formatting of Quarto.</p></li>
<li><p><strong>LaTeX</strong>: A document preparation system for producing scientific and technical documents. While highly customizable, it requires significant expertise and lacks the ease of Markdown tools like Quarto.</p></li>
<li><p><strong>Hugo</strong>: A static site generator used for creating websites and blogs from Markdown files. While efficient for websites, it doesn’t provide the same level of document control and integration as Quarto.</p></li>
<li><p><strong>Sphinx</strong>: A documentation generator mainly used for Python projects. It supports conversion to formats like HTML and PDF but lacks the cross-language support and document versatility of Quarto.</p></li>
<li><p><strong>Bookdown</strong>: An extension of R Markdown, designed for writing books and long documents. It supports multiple output formats but is mostly R-focused, while Quarto supports multiple languages.</p></li>
<li><p><strong>GitBook</strong>: A tool for creating documentation and books using Markdown. It allows collaboration but lacks the dynamic formatting and multi-language support found in Quarto.</p></li>
<li><p><strong>Pelican</strong>: A static site generator that uses Markdown or reStructuredText. Best suited for blogs, it doesn’t provide the integrated support for complex documents required by CLMS standards.</p></li>
<li><p><strong>Typora</strong>: A WYSIWYG Markdown editor that offers easy editing but lacks the advanced document control and integration capabilities that Quarto provides.</p></li>
</ul>
<p>A comparison of tools for CLMS documentation is shown in <a href="#tbl-indexing" class="quarto-xref">Table 1</a>. As <a href="#tbl-indexing" class="quarto-xref">Table 1</a> shows, Quarto outperforms other tools in terms of supported output formats and reproducibility.</p>
<div id="tbl-indexing" class="quarto-float quarto-figure quarto-figure-center anchored">
<figure class="quarto-float quarto-float-tbl figure">
<figcaption class="quarto-float-caption-top quarto-float-caption quarto-float-tbl" id="tbl-indexing-caption-0ceaefa1-69ba-4598-a22c-09a6ac19f8ca">
Table 1: Comparative analysis of Quarto versus other formatting tools.
</figcaption>
<div aria-describedby="tbl-indexing-caption-0ceaefa1-69ba-4598-a22c-09a6ac19f8ca">
<table class="caption-top table">
<colgroup>
<col style="width: 14%">
<col style="width: 20%">
<col style="width: 14%">
<col style="width: 15%">
<col style="width: 20%">
<col style="width: 16%">
</colgroup>
<thead>
<tr class="header">
<th>Tool</th>
<th>Cross-Language Support</th>
<th>Output Formats</th>
<th>Code Integration</th>
<th>Static Site Generation</th>
<th>Ideal Use Case</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td><strong>Quarto</strong></td>
<td>Yes</td>
<td>HTML, PDF, Word</td>
<td>Yes</td>
<td>Yes</td>
<td>Reports, blogs, CLMS docs</td>
</tr>
<tr class="even">
<td>R Markdown</td>
<td>R only</td>
<td>HTML, PDF, Word</td>
<td>Yes (R)</td>
<td>No</td>
<td>Statistical reports</td>
</tr>
<tr class="odd">
<td>Jupyter Notebooks</td>
<td>40+ languages</td>
<td>HTML, PDF</td>
<td>Yes</td>
<td>No</td>
<td>Data Science</td>
</tr>
<tr class="even">
<td>LaTeX</td>
<td>Limited</td>
<td>PDF, HTML</td>
<td>No</td>
<td>No</td>
<td>Scientific papers</td>
</tr>
<tr class="odd">
<td>Hugo</td>
<td>No</td>
<td>HTML</td>
<td>No</td>
<td>Yes</td>
<td>Blogs, websites</td>
</tr>
<tr class="even">
<td>Sphinx</td>
<td>Python</td>
<td>HTML, PDF</td>
<td>No</td>
<td>Yes</td>
<td>Python documentation</td>
</tr>
</tbody>
</table>
</div>
</figure>
</div>
</section>
<section id="quarto-markdown" class="level2">
<h2 class="anchored" data-anchor-id="quarto-markdown">3.2. Quarto Markdown</h2>
<p>Markdown is a lightweight, easy-to-read syntax used for formatting plain text documents <span class="citation" data-cites="cookintroduction mati2023eviewsr paciorek2023example"><a href="#ref-cookintroduction" role="doc-biblioref">[9]</a>, <a href="#ref-mati2023eviewsr" role="doc-biblioref">[10]</a>, <a href="#ref-paciorek2023example" role="doc-biblioref">[11]</a></span>. In Quarto, Markdown is extended to support additional features beyond standard Markdown, allowing users to write text, integrate code, and generate richly formatted documents in various formats such as HTML, PDF, and Word <span class="citation" data-cites="cookintroduction mati2023eviewsr paciorek2023example"><a href="#ref-cookintroduction" role="doc-biblioref">[9]</a>, <a href="#ref-mati2023eviewsr" role="doc-biblioref">[10]</a>, <a href="#ref-paciorek2023example" role="doc-biblioref">[11]</a></span>. Quarto Markdown combines the simplicity of regular Markdown with powerful features for document rendering, making it ideal for data analysis, technical writing, academic papers, and reports <span class="citation" data-cites="cookintroduction mati2023eviewsr paciorek2023example"><a href="#ref-cookintroduction" role="doc-biblioref">[9]</a>, <a href="#ref-mati2023eviewsr" role="doc-biblioref">[10]</a>, <a href="#ref-paciorek2023example" role="doc-biblioref">[11]</a></span>.</p>
<p>Quarto Markdown uses the standard Markdown syntax for headings, lists, emphasis, and links, while also supporting enhanced features like cross-referencing, citations, figures, tables, mathematical equations, and more <span class="citation" data-cites="cookintroduction mati2023eviewsr paciorek2023example"><a href="#ref-cookintroduction" role="doc-biblioref">[9]</a>, <a href="#ref-mati2023eviewsr" role="doc-biblioref">[10]</a>, <a href="#ref-paciorek2023example" role="doc-biblioref">[11]</a></span>. Quarto also allows for code execution in multiple programming languages (such as Python, R, and Julia) embedded within the Markdown file, enabling dynamic document creation where the outputs are generated directly from the code <span class="citation" data-cites="cookintroduction mati2023eviewsr paciorek2023example miroshnychenko2023quarto"><a href="#ref-cookintroduction" role="doc-biblioref">[9]</a>, <a href="#ref-mati2023eviewsr" role="doc-biblioref">[10]</a>, <a href="#ref-paciorek2023example" role="doc-biblioref">[11]</a>, <a href="#ref-miroshnychenko2023quarto" role="doc-biblioref">[12]</a></span>.</p>
<p>Key features of <strong>Markdown</strong> in <strong>Quarto</strong> are:</p>
<ul>
<li><strong>Standard Markdown</strong>: Supports headings, lists, links, images, bold, italics, etc.</li>
<li><strong>YAML Header</strong>: Allows users to specify metadata like title, author, date, and output formats (HTML, PDF, Word) at the start of the document.</li>
<li><strong>Cross-references</strong>: Provides automatic numbering and referencing for figures, tables, sections, etc.</li>
<li><strong>Code Execution</strong>: Integrates code cells for multiple programming languages, making it possible to run code and include its outputs directly in the document.</li>
<li><strong>Mathematics and Equations</strong>: Supports LaTeX-style equations for technical writing.</li>
<li><strong>Citations</strong>: Allows for referencing research papers and articles using BibTeX or CSL styles.</li>
<li><strong>Multi-output Format</strong>: Enables seamless conversion to multiple formats like HTML, PDF, Word, presentations, and slides.</li>
</ul>
<section id="significance" class="level3">
<h3 class="anchored" data-anchor-id="significance">3.2.1. Significance</h3>
<p>Markdown in Quarto is significant for CLMS documentation because of its <strong>simplicity and flexibility</strong>. With an <strong>easy-to-use syntax</strong>, it allows users to format text without requiring complex tools, making it accessible to both non-technical users and programmers. This flexibility enables the creation of a wide variety of documents, ranging from blog posts to scientific reports. Quarto extends standard Markdown by supporting <strong>rich formatting options</strong> essential for technical and academic writing, including built-in support for tables, figures, equations, footnotes, and cross-referencing. The <strong>integration of code and text</strong> is another powerful feature, allowing Quarto Markdown to embed code execution within documents. This is critical for reproducible research, enabling the inclusion of tables, charts, and figures generated directly from code, making it highly suitable for data science and technical reporting. Additionally, Quarto Markdown supports <strong>multi-format output</strong>, allowing users to create content once and export it to multiple formats like HTML, PDF, and Word, streamlining document preparation for different audiences. When used for online content, its structured format <strong>improves SEO (Search Engine Optimization)</strong>, making it easier for search engines to index the content and enhancing its discoverability. The ease of <strong>managing references, citations, and cross-references</strong> further strengthens its utility in academic and research documentation. Since Markdown files are plain text, Quarto seamlessly integrates with <strong>version control</strong> tools like Git, enabling easy <strong>collaboration</strong> among multiple contributors, especially in open-source and research communities.
Finally, Quarto Markdown’s versatility in document creation extends across blogs, technical documentation, reports, scientific papers, and books, making it an ideal tool for content creators across various disciplines.</p>
</section>
<section id="configuring-quarto-with-jupyter-notebooks" class="level3">
<h3 class="anchored" data-anchor-id="configuring-quarto-with-jupyter-notebooks">3.2.2. Configuring Quarto with Jupyter Notebooks</h3>
<p>To integrate <strong>Quarto</strong> with <strong>Jupyter Notebooks</strong>:</p>
<ul>
<li><p><strong>Install Quarto</strong>: Download and install Quarto from <a href="https://quarto.org/docs/get-started/">Quarto.org</a>.</p></li>
<li><p><strong>Install Jupyter</strong>: Ensure you have Jupyter installed. If not, install it using <code>pip</code>:</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash">pip install notebook</code></pre></div></li>
<li><p><strong>Rendering</strong>: You can directly write your content in Jupyter Notebooks and then render the notebook using Quarto to multiple formats:</p>
<div class="sourceCode"><pre class="sourceCode bash"><code class="sourceCode bash">quarto render your-notebook.ipynb --to html
quarto render your-notebook.ipynb --to pdf
quarto render your-notebook.ipynb --to docx</code></pre></div></li>
<li><p><strong>YAML Header in Jupyter</strong>:</p>
<div class="sourceCode" id="cb20"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a><span class="pp">---</span></span>
<span id="cb20-2"><a href="#cb20-2" aria-hidden="true" tabindex="-1"></a><span class="fu">title</span><span class="kw">:</span><span class="at"> </span><span class="st">"CLMS Data Analysis"</span></span>
<span id="cb20-3"><a href="#cb20-3" aria-hidden="true" tabindex="-1"></a><span class="fu">author</span><span class="kw">:</span><span class="at"> </span><span class="st">"Ayan Chatterjee"</span></span>
<span id="cb20-4"><a href="#cb20-4" aria-hidden="true" tabindex="-1"></a><span class="fu">format</span><span class="kw">:</span></span>
<span id="cb20-5"><a href="#cb20-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">html</span><span class="kw">:</span><span class="at"> default</span></span>
<span id="cb20-6"><a href="#cb20-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">pdf</span><span class="kw">:</span><span class="at"> default</span></span>
<span id="cb20-7"><a href="#cb20-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">docx</span><span class="kw">:</span><span class="at"> default </span></span>
<span id="cb20-8"><a href="#cb20-8" aria-hidden="true" tabindex="-1"></a><span class="pp">---</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div></li>
</ul>
</section>
</section>
<section id="indexing" class="level2">
<h2 class="anchored" data-anchor-id="indexing">3.3. Indexing</h2>
<p>Proper indexing is essential for increasing the discoverability and accessibility of CLMS products <span class="citation" data-cites="hassan2014improving coe2016website"><a href="#ref-hassan2014improving" role="doc-biblioref">[13]</a>, <a href="#ref-coe2016website" role="doc-biblioref">[14]</a></span>. By formatting documents using Quarto Markdown and generating a sitemap.xml, we can ensure that search engines and AI systems efficiently crawl and retrieve CLMS content <span class="citation" data-cites="hassan2014improving coe2016website"><a href="#ref-hassan2014improving" role="doc-biblioref">[13]</a>, <a href="#ref-coe2016website" role="doc-biblioref">[14]</a></span>. To improve document indexing for enhanced discoverability and accessibility, we can adopt the following approaches:</p>
<ul>
<li>Organize content using <strong>structured headers</strong> and <strong>metadata</strong> in Quarto Markdown.</li>
<li>Use proper keywords and descriptions in the document metadata.</li>
<li>Cross-reference related documents to create interconnected content that helps crawlers navigate.</li>
</ul>
<div class="sourceCode" id="cb21"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb21-1"><a href="#cb21-1" aria-hidden="true" tabindex="-1"></a><span class="pp">---</span></span>
<span id="cb21-2"><a href="#cb21-2" aria-hidden="true" tabindex="-1"></a><span class="fu">title</span><span class="kw">:</span><span class="at"> </span><span class="st">"Land Use Mapping with CLMS Data"</span></span>
<span id="cb21-3"><a href="#cb21-3" aria-hidden="true" tabindex="-1"></a><span class="fu">author</span><span class="kw">:</span><span class="at"> </span><span class="st">"Ayan Chatterjee"</span></span>
<span id="cb21-4"><a href="#cb21-4" aria-hidden="true" tabindex="-1"></a><span class="fu">date</span><span class="kw">:</span><span class="at"> </span><span class="st">"2024-08-01"</span></span>
<span id="cb21-5"><a href="#cb21-5" aria-hidden="true" tabindex="-1"></a><span class="fu">keywords</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="st">"land use"</span><span class="kw">,</span><span class="at"> </span><span class="st">"CLMS"</span><span class="kw">,</span><span class="at"> </span><span class="st">"mapping"</span><span class="kw">,</span><span class="at"> </span><span class="st">"environment"</span><span class="kw">]</span></span>
<span id="cb21-6"><a href="#cb21-6" aria-hidden="true" tabindex="-1"></a><span class="fu">description</span><span class="kw">:</span><span class="at"> </span><span class="st">"A detailed report on how CLMS data supports land use mapping."</span></span>
<span id="cb21-7"><a href="#cb21-7" aria-hidden="true" tabindex="-1"></a><span class="pp">---</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<section id="sitemap-generation" class="level3">
<h3 class="anchored" data-anchor-id="sitemap-generation">3.3.1. Sitemap Generation</h3>
<p>A sitemap.xml helps web crawlers discover all the content on the website <span class="citation" data-cites="hassan2014improving coe2016website"><a href="#ref-hassan2014improving" role="doc-biblioref">[13]</a>, <a href="#ref-coe2016website" role="doc-biblioref">[14]</a></span>. By providing a clear roadmap, crawlers can index each document, ensuring that all CLMS resources are available for search and AI training. By using <strong>Quarto Markdown</strong> and generating a <strong>sitemap.xml</strong>, CLMS documents can be structured in a way that improves their <strong>indexing</strong>, making them more <strong>discoverable</strong> by search engines and AI systems. This approach ensures efficient crawling, improves search engine ranking, and enhances the accessibility of CLMS products for users and AI models alike.</p>
<ul>
<li><strong>Search Engine Discoverability</strong>: Users and AI systems can easily find the indexed CLMS documents.</li>
<li><strong>Efficient Crawling</strong>: The sitemap provides a roadmap, allowing for faster and more accurate indexing.</li>
<li><strong>Increased Accessibility</strong>: Properly indexed documents are easier for users and AI to retrieve and utilize, improving the overall product visibility.</li>
</ul>
<div class="sourceCode" id="cb22"><pre class="sourceCode html code-with-copy"><code class="sourceCode html"><span id="cb22-1"><a href="#cb22-1" aria-hidden="true" tabindex="-1"></a><span class="dt">&lt;</span><span class="kw">urlset</span><span class="ot"> xmlns</span><span class="op">=</span><span class="st">"http://www.sitemaps.org/schemas/sitemap/0.9"</span><span class="dt">&gt;</span></span>
<span id="cb22-2"><a href="#cb22-2" aria-hidden="true" tabindex="-1"></a>  <span class="dt">&lt;</span><span class="kw">url</span><span class="dt">&gt;</span></span>
<span id="cb22-3"><a href="#cb22-3" aria-hidden="true" tabindex="-1"></a>    <span class="dt">&lt;</span><span class="kw">loc</span><span class="dt">&gt;</span>http://example.com/clms/land-use-mapping<span class="dt">&lt;/</span><span class="kw">loc</span><span class="dt">&gt;</span></span>
<span id="cb22-4"><a href="#cb22-4" aria-hidden="true" tabindex="-1"></a>    <span class="dt">&lt;</span><span class="kw">lastmod</span><span class="dt">&gt;</span>2024-08-01<span class="dt">&lt;/</span><span class="kw">lastmod</span><span class="dt">&gt;</span></span>
<span id="cb22-5"><a href="#cb22-5" aria-hidden="true" tabindex="-1"></a>    <span class="dt">&lt;</span><span class="kw">changefreq</span><span class="dt">&gt;</span>monthly<span class="dt">&lt;/</span><span class="kw">changefreq</span><span class="dt">&gt;</span></span>
<span id="cb22-6"><a href="#cb22-6" aria-hidden="true" tabindex="-1"></a>  <span class="dt">&lt;/</span><span class="kw">url</span><span class="dt">&gt;</span></span>
<span id="cb22-7"><a href="#cb22-7" aria-hidden="true" tabindex="-1"></a>  <span class="dt">&lt;</span><span class="kw">url</span><span class="dt">&gt;</span></span>
<span id="cb22-8"><a href="#cb22-8" aria-hidden="true" tabindex="-1"></a>    <span class="dt">&lt;</span><span class="kw">loc</span><span class="dt">&gt;</span>http://example.com/clms/land-cover-change<span class="dt">&lt;/</span><span class="kw">loc</span><span class="dt">&gt;</span></span>
<span id="cb22-9"><a href="#cb22-9" aria-hidden="true" tabindex="-1"></a>    <span class="dt">&lt;</span><span class="kw">lastmod</span><span class="dt">&gt;</span>2024-07-15<span class="dt">&lt;/</span><span class="kw">lastmod</span><span class="dt">&gt;</span></span>
<span id="cb22-10"><a href="#cb22-10" aria-hidden="true" tabindex="-1"></a>    <span class="dt">&lt;</span><span class="kw">changefreq</span><span class="dt">&gt;</span>monthly<span class="dt">&lt;/</span><span class="kw">changefreq</span><span class="dt">&gt;</span></span>
<span id="cb22-11"><a href="#cb22-11" aria-hidden="true" tabindex="-1"></a>  <span class="dt">&lt;/</span><span class="kw">url</span><span class="dt">&gt;</span></span>
<span id="cb22-12"><a href="#cb22-12" aria-hidden="true" tabindex="-1"></a><span class="dt">&lt;/</span><span class="kw">urlset</span><span class="dt">&gt;</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
</section>
<section id="steps-to-implement-and-submit-the-sitemap" class="level3">
<h3 class="anchored" data-anchor-id="steps-to-implement-and-submit-the-sitemap">3.3.2. Steps to Implement and Submit the Sitemap</h3>
<ul>
<li><p><strong>Generate the Sitemap</strong>: Use a sitemap generator tool (e.g., XML-Sitemaps or Screaming Frog) to create a sitemap, or have it generated automatically by a CMS like WordPress or a static site generator like Hugo.</p></li>
<li><p><strong>Upload the Sitemap</strong>: Once generated, place the sitemap.xml file in the root directory of your website, e.g., https://www.example.com/sitemap.xml.</p></li>
<li><p><strong>Submit to Search Engines</strong>: Submit your sitemap to search engines via tools like Google Search Console and Bing Webmaster Tools. This helps search engines index your site properly.</p></li>
</ul>
</section>
<section id="enhancing-indexing-for-web-crawlers-and-ai-models" class="level3">
<h3 class="anchored" data-anchor-id="enhancing-indexing-for-web-crawlers-and-ai-models">3.3.3. Enhancing Indexing for Web Crawlers and AI Models</h3>
<p>To ensure that CLMS documents are findable and accessible to web crawlers and AI models, it’s important to implement proper steps for generating and submitting a sitemap and using structured data (such as metadata and JSON-LD) to enhance indexing.</p>
<ul>
<li><p><strong>Descriptive Filenames</strong>: Use filenames that clearly describe the content of the document. For instance, instead of doc1.md, use clms-land-monitoring-data.md.</p></li>
<li><p><strong>Metadata</strong>: Add descriptive metadata in your Quarto Markdown files (e.g., title, author, keywords). This helps search engines and AI models understand the content better.</p></li>
<li><p><strong>Text Content</strong>: Ensure that text content is descriptive and structured using headings and subheadings to guide crawlers.</p></li>
<li><p><strong>HTML Metadata and JSON-LD Structured Data</strong>: Use HTML metadata and JSON-LD structured data within the Quarto document to improve how your content is indexed by search engines and used by AI training systems.</p></li>
</ul>
<p>The following Quarto Markdown YAML header example demonstrates how to enhance document visibility for web crawling and AI training by including metadata and structured data. This can be part of your CLMS documentation to ensure that it is well-indexed and easy to discover.</p>
<div class="sourceCode" id="cb23"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb23-1"><a href="#cb23-1" aria-hidden="true" tabindex="-1"></a><span class="pp">---</span></span>
<span id="cb23-2"><a href="#cb23-2" aria-hidden="true" tabindex="-1"></a><span class="fu">title</span><span class="kw">:</span><span class="at"> </span><span class="st">"CLMS Land Monitoring Data"</span></span>
<span id="cb23-3"><a href="#cb23-3" aria-hidden="true" tabindex="-1"></a><span class="fu">author</span><span class="kw">:</span><span class="at"> </span><span class="st">"Ayan Chatterjee"</span></span>
<span id="cb23-4"><a href="#cb23-4" aria-hidden="true" tabindex="-1"></a><span class="fu">date</span><span class="kw">:</span><span class="at"> </span><span class="st">"2024-09-15"</span></span>
<span id="cb23-5"><a href="#cb23-5" aria-hidden="true" tabindex="-1"></a><span class="fu">keywords</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="st">"CLMS"</span><span class="kw">,</span><span class="at"> </span><span class="st">"web crawling"</span><span class="kw">,</span><span class="at"> </span><span class="st">"AI training"</span><span class="kw">,</span><span class="at"> </span><span class="st">"environmental data"</span><span class="kw">]</span></span>
<span id="cb23-6"><a href="#cb23-6" aria-hidden="true" tabindex="-1"></a><span class="fu">description</span><span class="kw">:</span><span class="at"> </span><span class="st">"Comprehensive overview of CLMS land monitoring datasets, ......"</span></span>
<span id="cb23-7"><a href="#cb23-7" aria-hidden="true" tabindex="-1"></a><span class="fu">sitemap</span><span class="kw">:</span><span class="at"> </span><span class="ch">true</span><span class="co"> # Flag to include this document in the sitemap</span></span>
<span id="cb23-8"><a href="#cb23-8" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-9"><a href="#cb23-9" aria-hidden="true" tabindex="-1"></a><span class="co"># HTML metadata for SEO and discoverability</span></span>
<span id="cb23-10"><a href="#cb23-10" aria-hidden="true" tabindex="-1"></a><span class="fu">meta</span><span class="kw">:</span></span>
<span id="cb23-11"><a href="#cb23-11" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="kw">-</span><span class="at"> </span><span class="fu">name</span><span class="kw">:</span><span class="at"> </span><span class="st">"description"</span></span>
<span id="cb23-12"><a href="#cb23-12" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">content</span><span class="kw">:</span><span class="at"> </span><span class="st">"CLMS land monitoring datasets for environmental and climate ..."</span></span>
<span id="cb23-13"><a href="#cb23-13" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="kw">-</span><span class="at"> </span><span class="fu">name</span><span class="kw">:</span><span class="at"> </span><span class="st">"keywords"</span></span>
<span id="cb23-14"><a href="#cb23-14" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">content</span><span class="kw">:</span><span class="at"> </span><span class="st">"CLMS, land monitoring, environmental data, AI, web crawling"</span></span>
<span id="cb23-15"><a href="#cb23-15" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb23-16"><a href="#cb23-16" aria-hidden="true" tabindex="-1"></a><span class="co"># JSON-LD structured data to help search engines and AI understand the content</span></span>
<span id="cb23-17"><a href="#cb23-17" aria-hidden="true" tabindex="-1"></a><span class="fu">json-ld</span><span class="kw">:</span></span>
<span id="cb23-18"><a href="#cb23-18" aria-hidden="true" tabindex="-1"></a><span class="at">  </span><span class="kw">-</span><span class="at"> </span><span class="st">"@context"</span><span class="kw">:</span><span class="at"> </span><span class="st">"https://schema.org"</span></span>
<span id="cb23-19"><a href="#cb23-19" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">"@type"</span><span class="kw">:</span><span class="at"> </span><span class="st">"Dataset"</span></span>
<span id="cb23-20"><a href="#cb23-20" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">"name"</span><span class="kw">:</span><span class="at"> </span><span class="st">"CLMS Land Monitoring Data"</span></span>
<span id="cb23-21"><a href="#cb23-21" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">"description"</span><span class="kw">:</span><span class="at"> </span><span class="st">"Detailed data on land monitoring and ...."</span></span>
<span id="cb23-22"><a href="#cb23-22" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">"url"</span><span class="kw">:</span><span class="at"> </span><span class="st">"https://www.example.com/clms-land-monitoring-data"</span></span>
<span id="cb23-23"><a href="#cb23-23" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">"keywords"</span><span class="kw">:</span><span class="at"> </span><span class="st">"land monitoring, environmental data, AI training.."</span></span>
<span id="cb23-24"><a href="#cb23-24" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">"datePublished"</span><span class="kw">:</span><span class="at"> </span><span class="st">"2024-09-15"</span></span>
<span id="cb23-25"><a href="#cb23-25" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">"creator"</span><span class="kw">:</span></span>
<span id="cb23-26"><a href="#cb23-26" aria-hidden="true" tabindex="-1"></a><span class="at">      </span><span class="fu">"@type"</span><span class="kw">:</span><span class="at"> </span><span class="st">"Organization"</span></span>
<span id="cb23-27"><a href="#cb23-27" aria-hidden="true" tabindex="-1"></a><span class="at">      </span><span class="fu">"name"</span><span class="kw">:</span><span class="at"> </span><span class="st">"Copernicus Land Monitoring Service"</span></span>
<span id="cb23-28"><a href="#cb23-28" aria-hidden="true" tabindex="-1"></a><span class="at">    </span><span class="fu">"publisher"</span><span class="kw">:</span></span>
<span id="cb23-29"><a href="#cb23-29" aria-hidden="true" tabindex="-1"></a><span class="at">      </span><span class="fu">"@type"</span><span class="kw">:</span><span class="at"> </span><span class="st">"Organization"</span></span>
<span id="cb23-30"><a href="#cb23-30" aria-hidden="true" tabindex="-1"></a><span class="at">      </span><span class="fu">"name"</span><span class="kw">:</span><span class="at"> </span><span class="st">"European Environment Agency"</span></span>
<span id="cb23-31"><a href="#cb23-31" aria-hidden="true" tabindex="-1"></a><span class="pp">---</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
<div class="callout callout-style-default callout-important callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Important
</div>
</div>
<div class="callout-body-container callout-body">
<p><strong>Quarto</strong> stands out as the most versatile tool for creating CLMS-compliant documents, with cross-language support, integration of code, multiple output formats, and the ability to generate static websites.</p>
</div>
</div>
<div class="callout callout-style-default callout-important callout-titled">
<div class="callout-header d-flex align-content-center">
<div class="callout-icon-container">
<i class="callout-icon"></i>
</div>
<div class="callout-title-container flex-fill">
Important
</div>
</div>
<div class="callout-body-container callout-body">
<p>To ensure that CLMS documents are findable and accessible to web crawlers and AI models, it’s important to implement proper steps for generating and submitting a sitemap and using structured data (such as metadata and JSON-LD) to enhance indexing.</p>
</div>
</div>
</section>
</section>
</section>
<section id="recommended-standards-for-information-formatting" class="level1">
<h1>4. Recommended Standards for Information Formatting</h1>
<section id="suggested-standards" class="level2">
<h2 class="anchored" data-anchor-id="suggested-standards">4.1. Suggested Standards</h2>
<p>One of the main challenges in this task is improving the findability and discoverability of CLMS products. With the extensive range of data and services offered by CLMS, users often struggle to locate specific datasets or resources. Chatbots serve as a potential solution by guiding users to the appropriate resources. For chatbots to effectively perform this role, the data must be properly structured, categorized, and indexed. To support this:</p>
<ul>
<li><p>Documentation must be <strong>accessible to third-party chatbots</strong>. While CLMS chatbots will be the primary interaction point, external platforms should also access and retrieve relevant data. Exposing CLMS data in a structured and standardized format ensures interoperability across various chatbot systems, enhancing discoverability.</p></li>
<li><p>Recommendations will be provided on how CLMS should <strong>format and expose information</strong>. These guidelines will focus on best practices for metadata structuring, content organization, and linkable resources to optimize data formatting.</p></li>
</ul>
<p>The recommended standards for CLMS will include the use of Quarto Markdown, sitemaps, and structured metadata for web crawlers and AI systems.</p>
<ul>
<li><p><strong>Using Quarto Markdown for Data Structuring</strong>: Quarto Markdown allows for the clear organization of data, with structured sections such as headings, subheadings, and metadata fields. This makes it easier for web crawlers and AI systems to navigate the content and retrieve relevant information. Additionally, by using cross-referencing within Quarto Markdown documents, CLMS products and resources can be interconnected, providing users with a more seamless exploration experience.</p></li>
<li><p><strong>Implementing Sitemaps for Efficient Crawling</strong>: Sitemaps provide a roadmap for web crawlers, ensuring that all relevant pages and data sources are indexed. By creating comprehensive sitemaps that expose the entirety of the CLMS data repository, the task ensures that web crawlers and AI systems can efficiently discover and retrieve content. This is essential for making CLMS data easily accessible to third-party chatbots and AI platforms.</p></li>
</ul>
</section>
<section id="guideline-for-the-process-verification" class="level2">
<h2 class="anchored" data-anchor-id="guideline-for-the-process-verification">4.2. Guideline for the Process Verification</h2>
<p>We can compare the results of the search queries for both the unformatted and formatted documents. Typically, formatted documents with clear structure and metadata should provide better search accuracy because they provide more semantic meaning and context, making it easier for the search engine to retrieve relevant information. In this sub-section, we have outlined a step-by-step process for preparing and indexing documents to improve search accuracy. The focus is on comparing unformatted documents to formatted ones using Quarto Markdown, and how sitemap integration enhances search engine results.</p>
<ul>
<li>Step 1: Document Preparation
<ul>
<li>Create <strong>unformatted text</strong>, <strong>PDF</strong>, or <strong>Markdown files</strong>.</li>
<li>Create <strong>formatted documents</strong> using <strong>Quarto</strong>, which include metadata, clear headings, and semantic structure.</li>
</ul></li>
<li>Step 2: Sitemap Generation
<ul>
<li>For the formatted documents, generate a <strong>sitemap</strong> in XML format.</li>
<li>The sitemap should list all document URLs along with relevant metadata (e.g., last modified date, frequency of changes).</li>
</ul></li>
<li>Step 3: Set Up Search Engine
<ul>
<li>Choose a simple search engine library, such as <strong>Whoosh</strong>.</li>
<li>Create a <strong>search index</strong> for both sets of documents (formatted and unformatted).</li>
<li>Ensure that metadata is included in the search index for the formatted documents.</li>
</ul></li>
<li>Step 4: Develop Web Crawler
<ul>
<li>Write a simple <strong>web crawler</strong> to crawl both unformatted and formatted documents.</li>
<li>For the formatted documents, ensure the crawler uses the <strong>sitemap</strong> to guide the indexing process.</li>
</ul></li>
<li>Step 5: Test Search Accuracy
<ul>
<li>Perform search queries for common terms in both unformatted and formatted datasets.</li>
<li>Measure the relevance of search results using metrics like <strong>precision</strong>, <strong>recall</strong>, and <strong>F1 score</strong>.</li>
</ul></li>
<li>Step 6: Analyze Results
<ul>
<li>Compare the performance of the search engine on unformatted versus formatted documents.</li>
<li><strong>Hypothesis</strong>: Documents with structure and a sitemap will produce better search accuracy, yielding higher relevance in the results.</li>
</ul></li>
</ul>
<p>This Quarto Markdown setup can be used in a Jupyter notebook (ipynb) under a single section, maintaining clarity and structure in both the notebook and final rendered outputs (e.g., HTML, PDF, or DOCX).</p>
</section>
</section>
<section id="conclusion" class="level1">
<h1>5. Conclusion</h1>
<p>The <strong>European Environment Agency (EEA)</strong> recognizes the growing need for generative chatbots and natural language analysis tools to facilitate easy access to CLMS data. In response, the EEA is undertaking preparatory efforts to establish the necessary standards and infrastructure for successful chatbot integration. These activities focus on ensuring that CLMS products are <strong>findable</strong> and <strong>discoverable</strong>, enabling users, regardless of technical expertise, to access environmental data seamlessly.</p>
<p>A key part of this strategy is making CLMS documentation and data accessible to third-party generative AI platforms. By implementing standards for formatting and exposing information—particularly through <strong>Quarto Markdown</strong> and <strong>sitemaps</strong>—CLMS ensures that high-quality, structured data is available to chatbots and AI systems. This not only enhances product discoverability but also improves user experience, allowing chatbots to guide users through complex datasets and environmental resources.</p>
<p>The collaboration between CLMS and the EEA lays the groundwork for a future where AI systems can efficiently retrieve and process environmental data, supporting informed decision-making and increasing public engagement with CLMS products.</p>
</section>
<section id="references" class="level1">
<h1>6. References</h1>
</section>
</div>
<div id="quarto-appendix" class="default"><section class="quarto-appendix-contents" role="doc-bibliography" id="quarto-bibliography"><h2 class="anchored quarto-appendix-heading">References</h2><div id="refs" class="references csl-bib-body" data-entry-spacing="0" role="list">
<div id="ref-CLMS" class="csl-entry" role="listitem">
<div class="csl-left-margin">[1] </div><div class="csl-right-inline">E. Project, <span>“CLMS - Copernicus Land Monitoring Service.”</span> 2024. Available: <a href="https://land.copernicus.eu/en">https://land.copernicus.eu/en</a></div>
</div>
<div id="ref-khder2021web" class="csl-entry" role="listitem">
<div class="csl-left-margin">[2] </div><div class="csl-right-inline">M. A. Khder, <span>“Web scraping or web crawling: State of art, techniques, approaches and application.”</span> <em>International Journal of Advances in Soft Computing &amp; Its Applications</em>, vol. 13, no. 3, 2021.</div>
</div>
<div id="ref-massimino2016accessing" class="csl-entry" role="listitem">
<div class="csl-left-margin">[3] </div><div class="csl-right-inline">B. Massimino, <span>“Accessing online data: Web-crawling and information-scraping techniques to automate the assembly of research data,”</span> <em>Journal of Business Logistics</em>, vol. 37, no. 1, pp. 34–42, 2016.</div>
</div>
<div id="ref-kausar2013web" class="csl-entry" role="listitem">
<div class="csl-left-margin">[4] </div><div class="csl-right-inline">M. A. Kausar, V. Dhaka, and S. K. Singh, <span>“Web crawler: A review,”</span> <em>International Journal of Computer Applications</em>, vol. 63, no. 2, pp. 31–36, 2013.</div>
</div>
<div id="ref-saini2016information" class="csl-entry" role="listitem">
<div class="csl-left-margin">[5] </div><div class="csl-right-inline">C. Saini and V. Arora, <span>“Information retrieval in web crawling: A survey,”</span> in <em>2016 international conference on advances in computing, communications and informatics (ICACCI)</em>, IEEE, 2016, pp. 2635–2643.</div>
</div>
<div id="ref-hernandez2019deep" class="csl-entry" role="listitem">
<div class="csl-left-margin">[6] </div><div class="csl-right-inline">I. Hernández, C. R. Rivero, and D. Ruiz, <span>“Deep web crawling: A survey,”</span> <em>World Wide Web</em>, vol. 22, pp. 1577–1610, 2019.</div>
</div>
<div id="ref-deshmukh2021survey" class="csl-entry" role="listitem">
<div class="csl-left-margin">[7] </div><div class="csl-right-inline">S. Deshmukh and K. Vishwakarma, <span>“A survey on crawlers used in developing search engine,”</span> in <em>2021 5th international conference on intelligent computing and control systems (ICICCS)</em>, IEEE, 2021, pp. 1446–1452.</div>
</div>
<div id="ref-Crawl" class="csl-entry" role="listitem">
<div class="csl-left-margin">[8] </div><div class="csl-right-inline">Octoparse, <span>“Web crawl.”</span> 2024. Available: <a href="https://www.octoparse.com/">https://www.octoparse.com/</a></div>
</div>
<div id="ref-cookintroduction" class="csl-entry" role="listitem">
<div class="csl-left-margin">[9] </div><div class="csl-right-inline">J. J. Cook, <span>“An introduction to Quarto: A versatile open-source tool for data reporting and visualization.”</span></div>
</div>
<div id="ref-mati2023eviewsr" class="csl-entry" role="listitem">
<div class="csl-left-margin">[10] </div><div class="csl-right-inline">S. Mati, I. Civcir, and S. I. Abba, <span>“EviewsR: An R package for dynamic and reproducible research using EViews, R, R Markdown and Quarto.”</span> <em>R Journal</em>, vol. 15, no. 2, 2023.</div>
</div>
<div id="ref-paciorek2023example" class="csl-entry" role="listitem">
<div class="csl-left-margin">[11] </div><div class="csl-right-inline">C. Paciorek, <span>“An example Quarto markdown file,”</span> 2023.</div>
</div>
<div id="ref-miroshnychenko2023quarto" class="csl-entry" role="listitem">
<div class="csl-left-margin">[12] </div><div class="csl-right-inline">I. Miroshnychenko, <span>“QUARTO: REVOLUTIONIZING CONTENT CREATION,”</span> <em>Volume editor: Vitaliy Snytyuk, Dr. Sc., Prof. Program Committee: Aldrich Chris, Andreas Pester, Frederic Mallet, Hiroshi Tanaka, Iurii Krak, Yulia Khlevna, Karsten Henke, Oleg Chertov, Oleksandr Kuchanskyi, Oleksandr Marchenko, S<span>á</span>ndor Boz<span>ó</span>ki, Vitaliy Tsyganok, Vladimir Vovk Organizing Committee: Anatoly Anisimov, Vitaliy Snytyuk, Oleksii Bychkov, Oleh Ilarionov, Yuriі</em>, p. 189, 2023.</div>
</div>
<div id="ref-hassan2014improving" class="csl-entry" role="listitem">
<div class="csl-left-margin">[13] </div><div class="csl-right-inline">R. F. Hassan and S. Hussain, <span>“Improving the web indexing quality through a website-search engine coactions,”</span> <em>International Journal of Computer and Information Technology</em>, vol. 3, no. 2, 2014.</div>
</div>
<div id="ref-coe2016website" class="csl-entry" role="listitem">
<div class="csl-left-margin">[14] </div><div class="csl-right-inline">M. Coe, <span>“Website indexing,”</span> <em>The Indexer: The International Journal of Indexing</em>, vol. 34, no. 1, pp. 20–25, 2016.</div>
</div>
</div></section></div></main>
<!-- /main column -->
<script id="quarto-html-after-body" type="application/javascript">
window.document.addEventListener("DOMContentLoaded", function (event) {
// Mirror the colour mode of the given stylesheet element onto <body>.
// A data-mode attribute equal to "dark" selects the quarto-dark class; any
// other value (including a missing attribute) selects quarto-light. The
// class for the opposite mode is removed.
const toggleBodyColorMode = (bsSheetEl) => {
  const isDark = bsSheetEl.getAttribute("data-mode") === "dark";
  const { classList } = window.document.querySelector("body");
  classList.toggle("quarto-dark", isDark);
  classList.toggle("quarto-light", !isDark);
}
// Apply the body colour class that matches the primary Bootstrap stylesheet
// (link#quarto-bootstrap). Does nothing when that link element is absent.
const toggleBodyColorPrimary = () => {
  const primarySheet = window.document.querySelector("link#quarto-bootstrap");
  if (!primarySheet) {
    return;
  }
  toggleBodyColorMode(primarySheet);
}
// Sync the body colour class with the primary stylesheet once the DOM is ready.
toggleBodyColorPrimary();
// Glyph handed to AnchorJS as the anchor-link icon.
const icon = "";
// Add AnchorJS links, placed on the right, to every element matching `.anchored`.
const anchorJS = new window.AnchorJS();
anchorJS.options = {
placement: 'right',
icon: icon
};
anchorJS.add('.anchored');
// Report whether `el` carries at least one class whose name begins with
// "code-annotation-".
const isCodeAnnotation = (el) => {
  return Array.from(el.classList).some((className) => className.startsWith('code-annotation-'));
}
// ClipboardJS "success" handler. Gives visual feedback on the copy button
// (`e.trigger`) for one second, then restores the button and finally clears
// the text selection via `e.clearSelection()`.
//
// While the feedback is active the button carries the
// `code-copy-button-checked` class and a "Copied!" title. When
// `window.bootstrap` is present, a manually triggered "Copied!" tooltip is
// shown next to the button as well.
const onCopySuccess = function(e) {
  // button target
  const button = e.trigger;
  // don't keep focus
  button.blur();
  // flash "checked"
  button.classList.add('code-copy-button-checked');
  // null when the button has no title attribute at all
  var currentTitle = button.getAttribute("title");
  button.setAttribute("title", "Copied!");
  let tooltip;
  if (window.bootstrap) {
    button.setAttribute("data-bs-toggle", "tooltip");
    button.setAttribute("data-bs-placement", "left");
    button.setAttribute("data-bs-title", "Copied!");
    tooltip = new bootstrap.Tooltip(button,
      { trigger: "manual",
        customClass: "code-copy-button-tooltip",
        offset: [0, -8]});
    tooltip.show();
  }
  setTimeout(function() {
    if (tooltip) {
      tooltip.hide();
      button.removeAttribute("data-bs-title");
      button.removeAttribute("data-bs-toggle");
      button.removeAttribute("data-bs-placement");
    }
    // Put the title back exactly as it was. setAttribute() stringifies its
    // value, so writing back a missing (null) title would leave the literal
    // text "null" on the button; drop the attribute in that case.
    if (currentTitle === null) {
      button.removeAttribute("title");
    } else {
      button.setAttribute("title", currentTitle);
    }
    button.classList.remove('code-copy-button-checked');
  }, 1000);
  // clear code selection
  e.clearSelection();
}
// ClipboardJS `text` callback: returns the text to copy for a copy button.
// The text comes from a deep clone of the element immediately preceding
// `trigger`, after removing every direct child that is a code annotation.
// The clone keeps the rendered document untouched.
const getTextToCopy = function(trigger) {
  const codeEl = trigger.previousElementSibling.cloneNode(true);
  // `children` is a live HTMLCollection: removing a child while iterating it
  // shifts the remaining items down, so the element right after each removed
  // annotation would be skipped. Iterate a static snapshot instead.
  for (const childEl of Array.from(codeEl.children)) {
    if (isCodeAnnotation(childEl)) {
      childEl.remove();
    }
  }
  return codeEl.innerText;
}
// Wire ClipboardJS to every copy button that is not flagged as living inside
// a Quarto modal; the copied text comes from getTextToCopy.
const clipboard = new window.ClipboardJS('.code-copy-button:not([data-in-quarto-modal])', {
text: getTextToCopy
});
clipboard.on('success', onCopySuccess);
// Buttons flagged with data-in-quarto-modal get a second ClipboardJS instance,
// created only when the embedded-source-code modal element exists.
if (window.document.getElementById('quarto-embedded-source-code-modal')) {
// For code content inside modals, clipBoardJS needs to be initialized with a container option
// TODO: Check when it could be a function (https://github.com/zenorocha/clipboard.js/issues/860)
const clipboardModal = new window.ClipboardJS('.code-copy-button[data-in-quarto-modal]', {
text: getTextToCopy,
container: window.document.getElementById('quarto-embedded-source-code-modal')
});
clipboardModal.on('success', onCopySuccess);
}
// Patterns used to decide whether a link target counts as internal.
// http(s)://localhost, with an optional port, followed by a slash.
var localhostRegex = new RegExp(/^(?:http|https):\/\/localhost\:?[0-9]*\//);
// mailto: links.
var mailtoRegex = new RegExp(/^mailto:/);
// "/<current host>/" anywhere in the URL. The host is matched literally:
// regex metacharacters in it (the dots of a domain name, the brackets of an
// IPv6 literal) are escaped first, so "example.com" no longer also matches
// "exampleXcom" and a bracketed host is not read as a character class.
var filterRegex = new RegExp('/' + window.location.host.replace(/[.*+?^${}()|[\]\\]/g, '\\$&') + '/');
// Returns true when `href` matches any of the three patterns above.
var isInternal = (href) => {
  return filterRegex.test(href) || localhostRegex.test(href) || mailtoRegex.test(href);
}
// Walk every non-navigation link and, for the external ones, put back the
// href recorded in data-original-href when such a value exists.
var links = window.document.querySelectorAll('a[href]:not(.nav-link):not(.navbar-brand):not(.toc-action):not(.sidebar-link):not(.sidebar-item-toggle):not(.pagination-link):not(.no-external):not([aria-hidden]):not(.dropdown-item):not(.quarto-navigation-tool):not(.about-link)');
for (const link of links) {
  // quarto-nav.js may have rewritten the href of a link that we want to
  // treat as external; the saved original undoes that.
  const savedHref = link.dataset.originalHref;
  if (!isInternal(link.href) && savedHref !== undefined) {
    link.href = savedHref;
  }
}
// Attach a 'quarto'-themed tippy popup to `el` through window.tippy.
//   el            - element handed to window.tippy as the popup target
//   contentFn     - optional; becomes the tippy `content` option
//   onTriggerFn   - optional; becomes the tippy `onTrigger` option
//   onUntriggerFn - optional; becomes the tippy `onUntrigger` option
// Falsy callbacks are left out of the config entirely.
function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
  const config = {
    allowHTML: true,
    maxWidth: 500,
    delay: 100,
    arrow: false,
    // The popup is appended to the parent of the element tippy passes in.
    appendTo: (reference) => reference.parentElement,
    interactive: true,
    interactiveBorder: 10,
    theme: 'quarto',
    placement: 'bottom-start',
  };
  const optionalHooks = [
    ['content', contentFn],
    ['onTrigger', onTriggerFn],
    ['onUntrigger', onUntriggerFn],
  ];
  for (const [option, hook] of optionalHooks) {
    if (hook) {
      config[option] = hook;
    }
  }
  window.tippy(el, config);
}
// Give every footnote reference link a hover popup whose content is the
// innerHTML of the element it points at.
const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
for (const ref of noterefs) {
  tippyHover(ref, function() {
    // use id or data attribute instead here
    let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
    // Reduce an absolute URL to its fragment; anything URL() cannot parse
    // is kept unchanged.
    try { href = new URL(href).hash; } catch {}
    // Drop the leading "#" (or "#/") to obtain the target element id.
    const note = window.document.getElementById(href.replace(/^#\/?/, ""));
    return note ? note.innerHTML : "";
  });
}
// All links that carry the `quarto-xref` class.
const xrefs = window.document.querySelectorAll('a.quarto-xref');
const processXRef = (id, note) => {
// Strip column container classes
const stripColumnClz = (el) => {
el.classList.remove("page-full", "page-columns");
if (el.children) {
for (const child of el.children) {
stripColumnClz(child);
}
}
}
stripColumnClz(note)
if (id === null || id.startsWith('sec-')) {
// Special case sections, only their first couple elements
const container = document.createElement("div");
if (note.children && note.children.length > 2) {
container.appendChild(note.children[0].cloneNode(true));
for (let i = 1; i < note.children.length; i++) {
const child = note.children[i];
if (child.tagName === "P" && child.innerText === "") {
continue;
} else {
container.appendChild(child.cloneNode(true));
break;
}
}
if (window.Quarto?.typesetMath) {
window.Quarto.typesetMath(container);
}
return container.innerHTML
} else {
if (window.Quarto?.typesetMath) {
window.Quarto.typesetMath(note);
}
return note.innerHTML;
}
} else {
// Remove any anchor links if they are present
const anchorLink = note.querySelector('a.anchorjs-link');
if (anchorLink) {
anchorLink.remove();
}
if (window.Quarto?.typesetMath) {
window.Quarto.typesetMath(note);
}
// TODO in 1.5, we should make sure this works without a callout special case
if (note.classList.contains("callout")) {
return note.outerHTML;
} else {
return note.innerHTML;