From afc1b89ac71b642a841c8b79ea0ec6b70d2adae5 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Fri, 3 Jul 2015 22:12:49 -0700 Subject: [PATCH] Added LaTeX renderer. * New exported function in API: `cmark_render_latex`. * Added src/latex.c. * Updated README and man page. * Closes #31. --- Makefile | 2 +- README.md | 4 +- man/man1/cmark.1 | 4 +- man/man3/cmark.3 | 8 +- src/CMakeLists.txt | 1 + src/cmark.h | 5 + src/latex.c | 609 +++++++++++++++++++++++++++++++++++++++++++++ src/main.c | 10 +- 8 files changed, 635 insertions(+), 8 deletions(-) create mode 100644 src/latex.c diff --git a/Makefile b/Makefile index f11431c55..ddcd10585 100644 --- a/Makefile +++ b/Makefile @@ -132,7 +132,7 @@ $(ALLTESTS): $(SPEC) leakcheck: $(ALLTESTS) rc=0; \ - for format in html man xml commonmark; do \ + for format in html man xml latex commonmark; do \ for opts in "" "--smart" "--normalize"; do \ echo "cmark -t $$format $$opts" ; \ cat $< | valgrind -q --leak-check=full --dsymutil=yes --error-exitcode=1 $(PROG) -t $$format $$opts >/dev/null || rc=1; \ diff --git a/README.md b/README.md index b9be9b1f1..c0ca22d8f 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ rationalized version of Markdown syntax with a [spec][the spec]. It provides a shared library (`libcmark`) with functions for parsing CommonMark documents to an abstract syntax tree (AST), manipulating -the AST, and rendering the document to HTML, groff man, +the AST, and rendering the document to HTML, groff man, LaTeX, CommonMark, or an XML representation of the AST. It also provides a command-line program (`cmark`) for parsing and rendering CommonMark documents. @@ -43,7 +43,7 @@ Advantages of this library: - **Flexible.** CommonMark input is parsed to an AST which can be manipulated programatically prior to rendering. -- **Multiple renderers.** Output in HTML, groff man, CommonMark, +- **Multiple renderers.** Output in HTML, groff man, LaTeX, CommonMark, and a custom XML format is supported. 
And it is easy to write new renderers to support other formats. diff --git a/man/man1/cmark.1 b/man/man1/cmark.1 index 8c1c2c7d2..64fa69710 100644 --- a/man/man1/cmark.1 +++ b/man/man1/cmark.1 @@ -10,7 +10,7 @@ file* .SH "DESCRIPTION" \fBcmark\fR converts Markdown formatted plain text to either HTML, groff man, -CommonMark XML, or CommonMark, using the conventions +CommonMark XML, LaTeX, or CommonMark, using the conventions described in the CommonMark spec. It reads input from \fIstdin\fR or the specified files (concatenating their contents) and writes output to \fIstdout\fR. @@ -18,7 +18,7 @@ output to \fIstdout\fR. .TP 12n .B \-\-to, \-t \f[I]FORMAT\f[] Specify output format (\f[C]html\f[], \f[C]man\f[], \f[C]xml\f[], -\f[C]commonmark\f[]). +\f[C]latex\f[], \f[C]commonmark\f[]). .TP 12n .B \-\-width \f[I]WIDTH\f[] Specify a column width to which to wrap the output. For no wrapping, use diff --git a/man/man3/cmark.3 b/man/man3/cmark.3 index 6edac7bea..652112637 100644 --- a/man/man3/cmark.3 +++ b/man/man3/cmark.3 @@ -1,4 +1,4 @@ -.TH cmark 3 "June 25, 2015" "LOCAL" "Library Functions Manual" +.TH cmark 3 "July 03, 2015" "LOCAL" "Library Functions Manual" .SH NAME .PP @@ -479,6 +479,12 @@ Render a \f[I]node\f[] tree as a groff man page, without the header. .PP Render a \f[I]node\f[] tree as a commonmark document. +.PP +\fIchar *\f[] \fBcmark_render_latex\f[](\fIcmark_node *root\f[], \fIint options\f[], \fIint width\f[]) + +.PP +Render a \f[I]node\f[] tree as a LaTeX document. 
+ .PP .nf \fC diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 37a46d3cc..0324fb451 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -32,6 +32,7 @@ set(LIBRARY_SOURCES xml.c html.c commonmark.c + latex.c houdini_href_e.c houdini_html_e.c houdini_html_u.c diff --git a/src/cmark.h b/src/cmark.h index 192290c12..6618301a9 100644 --- a/src/cmark.h +++ b/src/cmark.h @@ -486,6 +486,11 @@ char *cmark_render_man(cmark_node *root, int options); CMARK_EXPORT char *cmark_render_commonmark(cmark_node *root, int options, int width); +/** Render a 'node' tree as a LaTeX document. + */ +CMARK_EXPORT +char *cmark_render_latex(cmark_node *root, int options, int width); + /** Default writer options. */ #define CMARK_OPT_DEFAULT 0 diff --git a/src/latex.c b/src/latex.c new file mode 100644 index 000000000..5eea3a0d3 --- /dev/null +++ b/src/latex.c @@ -0,0 +1,609 @@ +#include +#include +#include +#include +#include + +#include "config.h" +#include "cmark.h" +#include "node.h" +#include "buffer.h" +#include "utf8.h" +#include "scanners.h" + +// Functions to convert cmark_nodes to LaTeX strings. 
+ +struct render_state { + int options; + cmark_strbuf* buffer; + cmark_strbuf* prefix; + int column; + int width; + int need_cr; + int enumlevel; + bufsize_t last_breakable; + bool begin_line; + bool no_wrap; + bool in_tight_list_item; + bool silence; +}; + +static inline void cr(struct render_state *state) +{ + if (state->need_cr < 1) { + state->need_cr = 1; + } +} + +static inline void blankline(struct render_state *state) +{ + if (state->need_cr < 2) { + state->need_cr = 2; + } +} + +typedef enum { + LITERAL, + NORMAL, + URL +} escaping; + +static inline void out(struct render_state *state, + cmark_chunk str, + bool wrap, + escaping escape) +{ + unsigned char* source = str.data; + int length = str.len; + unsigned char nextc; + int32_t c; + int i = 0; + int len; + cmark_chunk remainder = cmark_chunk_literal(""); + int k = state->buffer->size - 1; + + if (state->silence) + return; + + wrap = wrap && !state->no_wrap; + + if (state->in_tight_list_item && state->need_cr > 1) { + state->need_cr = 1; + } + while (state->need_cr) { + if (k < 0 || state->buffer->ptr[k] == '\n') { + k -= 1; + } else { + cmark_strbuf_putc(state->buffer, '\n'); + if (state->need_cr > 1) { + cmark_strbuf_put(state->buffer, state->prefix->ptr, + state->prefix->size); + } + } + state->column = 0; + state->begin_line = true; + state->need_cr -= 1; + } + + while (i < length) { + if (state->begin_line) { + cmark_strbuf_put(state->buffer, state->prefix->ptr, + state->prefix->size); + // note: this assumes prefix is ascii: + state->column = state->prefix->size; + } + + len = utf8proc_iterate(source + i, length - i, &c); + if (len == -1) { // error condition + return; // return without rendering rest of string + } + nextc = source[i + len]; + if (c == 32 && wrap) { + if (!state->begin_line) { + cmark_strbuf_putc(state->buffer, ' '); + state->column += 1; + state->begin_line = false; + state->last_breakable = state->buffer->size - + 1; + // skip following spaces + while (source[i + 1] == ' ') { + 
i++; + } + } + + } else if (c == 10) { + cmark_strbuf_putc(state->buffer, '\n'); + state->column = 0; + state->begin_line = true; + state->last_breakable = 0; + } else if (escape == LITERAL) { + utf8proc_encode_char(c, state->buffer); + state->column += 2; + } else { + switch(c) { + case 123: // '{' + case 125: // '}' + case 35: // '#' + case 37: // '%' + case 38: // '&' + cmark_strbuf_putc(state->buffer, '\\'); + utf8proc_encode_char(c, state->buffer); + state->column += 2; + break; + case 36: // '$' + case 95: // '_' + if (escape == NORMAL) { + cmark_strbuf_putc(state->buffer, '\\'); + } + utf8proc_encode_char(c, state->buffer); + break; + case 45 : // '-' + if (nextc == 45) { // prevent ligature + cmark_strbuf_putc(state->buffer, '\\'); + } + utf8proc_encode_char(c, state->buffer); + break; + case 126: // '~' + if (escape == NORMAL) { + cmark_strbuf_puts(state->buffer, + "\\textasciitilde{}"); + } else { + utf8proc_encode_char(c, state->buffer); + } + break; + case 94: // '^' + cmark_strbuf_puts(state->buffer, + "\\^{}"); + break; + case 92: // '\\' + if (escape == URL) { + // / acts as path sep even on windows: + cmark_strbuf_puts(state->buffer, "/"); + } else { + cmark_strbuf_puts(state->buffer, + "\\textbackslash{}"); + } + break; + case 124: // '|' + cmark_strbuf_puts(state->buffer, + "\\textbar{}"); + break; + case 60: // '<' + cmark_strbuf_puts(state->buffer, + "\\textless{}"); + break; + case 62: // '>' + cmark_strbuf_puts(state->buffer, + "\\textgreater{}"); + break; + case 91: // '[' + case 93: // ']' + cmark_strbuf_putc(state->buffer, '{'); + utf8proc_encode_char(c, state->buffer); + cmark_strbuf_putc(state->buffer, '}'); + break; + case 39: // '\'' + cmark_strbuf_puts(state->buffer, + "\\textquotesingle{}"); + break; + case 160: // nbsp + cmark_strbuf_putc(state->buffer, '~'); + break; + case 8230: // hellip + cmark_strbuf_puts(state->buffer, "\\ldots{}"); + break; + case 8216: // lsquo + if (escape == NORMAL) { + cmark_strbuf_putc(state->buffer, 
'`'); + } else { + utf8proc_encode_char(c, state->buffer); + } + break; + case 8217: // rsquo + if (escape == NORMAL) { + cmark_strbuf_putc(state->buffer, '\''); + } else { + utf8proc_encode_char(c, state->buffer); + } + break; + case 8220: // ldquo + if (escape == NORMAL) { + cmark_strbuf_puts(state->buffer, "``"); + } else { + utf8proc_encode_char(c, state->buffer); + } + break; + case 8221: // rdquo + if (escape == NORMAL) { + cmark_strbuf_puts(state->buffer, "''"); + } else { + utf8proc_encode_char(c, state->buffer); + } + break; + case 8212: // emdash + if (escape == NORMAL) { + cmark_strbuf_puts(state->buffer, "---"); + } else { + utf8proc_encode_char(c, state->buffer); + } + break; + case 8211: // endash + if (escape == NORMAL) { + cmark_strbuf_puts(state->buffer, "--"); + } else { + utf8proc_encode_char(c, state->buffer); + } + break; + default: + utf8proc_encode_char(c, state->buffer); + state->column += 1; + state->begin_line = false; + } + } + + // If adding the character went beyond width, look for an + // earlier place where the line could be broken: + if (state->width > 0 && + state->column > state->width && + !state->begin_line && + state->last_breakable > 0) { + + // copy from last_breakable to remainder + cmark_chunk_set_cstr(&remainder, (char *) state->buffer->ptr + state->last_breakable + 1); + // truncate at last_breakable + cmark_strbuf_truncate(state->buffer, state->last_breakable); + // add newline, prefix, and remainder + cmark_strbuf_putc(state->buffer, '\n'); + cmark_strbuf_put(state->buffer, state->prefix->ptr, + state->prefix->size); + cmark_strbuf_put(state->buffer, remainder.data, remainder.len); + state->column = state->prefix->size + remainder.len; + cmark_chunk_free(&remainder); + state->last_breakable = 0; + state->begin_line = false; + } + + i += len; + } +} + +static void lit(struct render_state *state, char *s, bool wrap) +{ + cmark_chunk str = cmark_chunk_literal(s); + out(state, str, wrap, LITERAL); +} + +typedef enum { + 
NO_LINK, + URL_AUTOLINK, + EMAIL_AUTOLINK, + NORMAL_LINK +} link_type; + +static link_type +get_link_type(cmark_node *node) +{ + cmark_chunk *title; + cmark_chunk *url; + cmark_node *link_text; + char *realurl; + int realurllen; + bool isemail = false; + + if (node->type != CMARK_NODE_LINK) { + return NO_LINK; + } + + url = &node->as.link.url; + if (url->len == 0 || scan_scheme(url, 0) == 0) { + return NO_LINK; + } + + title = &node->as.link.title; + // if it has a title, we can't treat it as an autolink: + if (title->len > 0) { + return NORMAL_LINK; + } + + link_text = node->first_child; + cmark_consolidate_text_nodes(link_text); + realurl = (char*)url->data; + realurllen = url->len; + if (strncmp(realurl, "mailto:", 7) == 0) { + realurl += 7; + realurllen -= 7; + isemail = true; + } + if (realurllen == link_text->as.literal.len && + strncmp(realurl, + (char*)link_text->as.literal.data, + link_text->as.literal.len) == 0) { + if (isemail) { + return EMAIL_AUTOLINK; + } else { + return URL_AUTOLINK; + } + } else { + return NORMAL_LINK; + } +} + +// if node is a block node, returns node. +// otherwise returns first block-level node that is an ancestor of node. +static cmark_node* +get_containing_block(cmark_node *node) +{ + while (node && + (node->type < CMARK_NODE_FIRST_BLOCK || + node->type > CMARK_NODE_LAST_BLOCK)) { + node = node->parent; + } + return node; +} + +static int +S_render_node(cmark_node *node, cmark_event_type ev_type, + struct render_state *state) +{ + cmark_node *tmp; + cmark_chunk *code; + int list_number; + char list_number_string[20]; + bool entering = (ev_type == CMARK_EVENT_ENTER); + cmark_list_type list_type; + cmark_chunk list_name; + cmark_chunk url; + const char* roman_numerals[] = { "", "i", "ii", "iii", "iv", "v", + "vi", "vii", "viii", "ix", "x" }; + + // Don't adjust tight list status til we've started the list. + // Otherwise we loose the blank line between a paragraph and + // a following list. 
+ if (!(node->type == CMARK_NODE_ITEM && node->prev == NULL && + entering)) { + tmp = get_containing_block(node); + state->in_tight_list_item = + (tmp->type == CMARK_NODE_ITEM && + cmark_node_get_list_tight(tmp->parent)) || + (tmp && + tmp->parent && + tmp->parent->type == CMARK_NODE_ITEM && + cmark_node_get_list_tight(tmp->parent->parent)); + } + + switch (node->type) { + case CMARK_NODE_DOCUMENT: + if (!entering) { + cmark_strbuf_putc(state->buffer, '\n'); + } + break; + + case CMARK_NODE_BLOCK_QUOTE: + if (entering) { + lit(state, "\\begin{quote}", false); + cr(state); + } else { + lit(state, "\\end{quote}", false); + blankline(state); + } + break; + + case CMARK_NODE_LIST: + list_type = cmark_node_get_list_type(node); + list_name = cmark_chunk_literal( + list_type == CMARK_ORDERED_LIST ? + "enumerate" : "itemize"); + if (entering) { + if (list_type == CMARK_ORDERED_LIST) { + state->enumlevel++; + } + lit(state, "\\begin{", false); + out(state, list_name, false, false); + lit(state, "}", false); + cr(state); + list_number = cmark_node_get_list_start(node); + if (list_number > 1) { + snprintf(list_number_string, 19, + "%d", list_number); + lit(state, "\\setcounter{enum", false); + lit(state, (char *)roman_numerals[state->enumlevel], + false); + lit(state, "}{", false); + out(state, + cmark_chunk_literal(list_number_string), + false, NORMAL); + lit(state, "}", false); + cr(state); + } + } else { + if (list_type == CMARK_ORDERED_LIST) { + state->enumlevel--; + } + lit(state, "\\end{", false); + out(state, list_name, false, false); + lit(state, "}", false); + blankline(state); + } + cmark_chunk_free(&list_name); + break; + + case CMARK_NODE_ITEM: + if (entering) { + lit(state, "\\item ", false); + } else { + cr(state); + } + break; + + case CMARK_NODE_HEADER: + if (entering) { + switch (cmark_node_get_header_level(node)) { + case 1: + lit(state, "\\section", false); + break; + case 2: + lit(state, "\\subsection", false); + break; + case 3: + lit(state, 
"\\subsubsection", false); + break; + case 4: + lit(state, "\\paragraph", false); + break; + case 5: + lit(state, "\\subparagraph", false); + break; + } + lit(state, "{", false); + } else { + lit(state, "}", false); + blankline(state); + } + break; + + case CMARK_NODE_CODE_BLOCK: + cr(state); + lit(state, "\\begin{verbatim}", false); + cr(state); + code = &node->as.code.literal; + out(state, node->as.code.literal, false, LITERAL); + cr(state); + lit(state, "\\end{verbatim}", false); + blankline(state); + break; + + case CMARK_NODE_HTML: + break; + + case CMARK_NODE_HRULE: + blankline(state); + lit(state, "\\begin{center}\\rule{0.5\\linewidth}{\\linethickness}\\end{center}", false); + blankline(state); + break; + + case CMARK_NODE_PARAGRAPH: + if (!entering) { + blankline(state); + } + break; + + case CMARK_NODE_TEXT: + out(state, node->as.literal, true, NORMAL); + break; + + case CMARK_NODE_LINEBREAK: + lit(state, "\\\\", false); + cr(state); + break; + + case CMARK_NODE_SOFTBREAK: + if (state->width == 0) { + cr(state); + } else { + lit(state, " ", true); + } + break; + + case CMARK_NODE_CODE: + lit(state, "\\texttt{", false); + out(state, node->as.literal, false, NORMAL); + lit(state, "}", false); + break; + + case CMARK_NODE_INLINE_HTML: + break; + + case CMARK_NODE_STRONG: + if (entering) { + lit(state, "\\strong{", false); + } else { + lit(state, "}", false); + } + break; + + case CMARK_NODE_EMPH: + if (entering) { + lit(state, "\\emph{", false); + } else { + lit(state, "}", false); + } + break; + + case CMARK_NODE_LINK: + if (entering) { + url = cmark_chunk_literal(cmark_node_get_url(node)); + switch(get_link_type(node)) { + case URL_AUTOLINK: + lit(state, "\\url{", false); + out(state, url, false, URL); + break; + case EMAIL_AUTOLINK: + lit(state, "\\href{", false); + out(state, url, false, URL); + lit(state, "}\\nolinkurl{", false); + break; + case NORMAL_LINK: + lit(state, "\\href{", false); + out(state, url, false, URL); + lit(state, "}{", false); + 
break; + case NO_LINK: + lit(state, "{", false); // error? + } + } else { + lit(state, "}", false); + } + + break; + + case CMARK_NODE_IMAGE: + if (entering) { + url = cmark_chunk_literal(cmark_node_get_url(node)); + lit(state, "\\protect\\includegraphics{", false); + out(state, url, false, URL); + lit(state, "}", false); + state->silence = true; // don't print the alt text + } else { + state->silence = false; + } + break; + + default: + assert(false); + break; + } + + return 1; +} + +char *cmark_render_latex(cmark_node *root, int options, int width) +{ + char *result; + cmark_strbuf commonmark = GH_BUF_INIT; + cmark_strbuf prefix = GH_BUF_INIT; + if (CMARK_OPT_HARDBREAKS & options) { + width = 0; + } + struct render_state state = { + options, &commonmark, &prefix, 0, width, + 0, 0, 0, true, false, false, false + }; + cmark_node *cur; + cmark_event_type ev_type; + cmark_iter *iter = cmark_iter_new(root); + + while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) { + cur = cmark_iter_get_node(iter); + if (!S_render_node(cur, ev_type, &state)) { + // a false value causes us to skip processing + // the node's contents. this is used for + // autolinks. 
+ cmark_iter_reset(iter, cur, CMARK_EVENT_EXIT); + } + } + result = (char *)cmark_strbuf_detach(&commonmark); + + cmark_strbuf_free(&prefix); + cmark_iter_free(iter); + return result; +} diff --git a/src/main.c b/src/main.c index 5dc97b2fd..e20b0dc57 100644 --- a/src/main.c +++ b/src/main.c @@ -17,14 +17,15 @@ typedef enum { FORMAT_HTML, FORMAT_XML, FORMAT_MAN, - FORMAT_COMMONMARK + FORMAT_COMMONMARK, + FORMAT_LATEX } writer_format; void print_usage() { printf("Usage: cmark [FILE*]\n"); printf("Options:\n"); - printf(" --to, -t FORMAT Specify output format (html, xml, man, commonmark)\n"); + printf(" --to, -t FORMAT Specify output format (html, xml, man, commonmark, latex)\n"); printf(" --width WIDTH Specify wrap width (default 0 = nowrap)\n"); printf(" --sourcepos Include source position attribute\n"); printf(" --hardbreaks Treat newlines as hard line breaks\n"); @@ -52,6 +53,9 @@ static void print_document(cmark_node *document, writer_format writer, case FORMAT_COMMONMARK: result = cmark_render_commonmark(document, options, width); break; + case FORMAT_LATEX: + result = cmark_render_latex(document, options, width); + break; default: fprintf(stderr, "Unknown format %d\n", writer); exit(1); @@ -125,6 +129,8 @@ int main(int argc, char *argv[]) writer = FORMAT_XML; } else if (strcmp(argv[i], "commonmark") == 0) { writer = FORMAT_COMMONMARK; + } else if (strcmp(argv[i], "latex") == 0) { + writer = FORMAT_LATEX; } else { fprintf(stderr, "Unknown format %s\n", argv[i]);