From 4ac42ae365008f3d834bbde52cc7dee9b704d137 Mon Sep 17 00:00:00 2001 From: Brion Date: Fri, 30 Dec 2022 13:52:45 +0530 Subject: [PATCH] chore: flip files --- .editorconfig | 8 +- .eslintignore | 2 + .eslintrc.cjs | 43 + .eslintrc.json | 9 - .gitignore | 6 + .prettierignore | 2 + .prettierrc | 4 - .travis.yml | 21 - README.md | 187 +- bin/medium2mdx | 4 + index.js | 49 - lib/converter.js | 530 +++ lib/get-feed.js | 23 - lib/get-post.js | 206 - lib/index.js | 77 + lib/logger.js | 8 + lib/markdown.js | 175 + lib/press.js | 60 + lib/utils.js | 261 -- package.json | 100 +- pnpm-lock.yaml | 3297 +++++++++++++++ prettier.config.cjs | 25 + templates/default.js | 33 + templates/my-template.js | 39 + test/get-feed-test.js | 6 - test/get-post-test.js | 14 - test/testpost.json | 1 - test/utils-test.js | 35 - yarn.lock | 8209 -------------------------------------- 29 files changed, 4502 insertions(+), 8932 deletions(-) create mode 100644 .eslintignore create mode 100644 .eslintrc.cjs delete mode 100644 .eslintrc.json mode change 100644 => 100755 .gitignore create mode 100644 .prettierignore delete mode 100644 .prettierrc delete mode 100644 .travis.yml create mode 100755 bin/medium2mdx delete mode 100755 index.js create mode 100755 lib/converter.js delete mode 100644 lib/get-feed.js delete mode 100644 lib/get-post.js create mode 100644 lib/index.js create mode 100644 lib/logger.js create mode 100644 lib/markdown.js create mode 100644 lib/press.js delete mode 100644 lib/utils.js create mode 100644 pnpm-lock.yaml create mode 100644 prettier.config.cjs create mode 100644 templates/default.js create mode 100755 templates/my-template.js delete mode 100644 test/get-feed-test.js delete mode 100644 test/get-post-test.js delete mode 100644 test/testpost.json delete mode 100644 test/utils-test.js delete mode 100644 yarn.lock diff --git a/.editorconfig b/.editorconfig index 3dce414..6e87a00 100644 --- a/.editorconfig +++ b/.editorconfig @@ -1,9 +1,13 @@ +# Editor configuration, see 
http://editorconfig.org root = true [*] charset = utf-8 indent_style = space indent_size = 2 -end_of_line = lf insert_final_newline = true -trim_trailing_whitespace = true \ No newline at end of file +trim_trailing_whitespace = true + +[*.md] +max_line_length = off +trim_trailing_whitespace = false diff --git a/.eslintignore b/.eslintignore new file mode 100644 index 0000000..739f6a5 --- /dev/null +++ b/.eslintignore @@ -0,0 +1,2 @@ +medium-export +output diff --git a/.eslintrc.cjs b/.eslintrc.cjs new file mode 100644 index 0000000..2c62315 --- /dev/null +++ b/.eslintrc.cjs @@ -0,0 +1,43 @@ +/** + * MIT License + * + * Copyright (c) 2022, Brion Mario. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +const path = require('path'); + +module.exports = { + env: { + node: true, + es6: true, + }, + plugins: ['@brionmario'], + extends: [ + 'plugin:@brionmario/strict', + 'plugin:@brionmario/internal', + 'plugin:@brionmario/jest', + 'plugin:@brionmario/prettier', + ], + parserOptions: { + ecmaVersion: 2018, + project: [path.resolve(__dirname, 'tsconfig.json')], + }, +}; diff --git a/.eslintrc.json b/.eslintrc.json deleted file mode 100644 index be8a38e..0000000 --- a/.eslintrc.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "parserOptions": { - "ecmaVersion": 2018 - }, - "plugins": ["prettier"], - "rules": { - "prettier/prettier": "error" - } -} diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 index 64742f9..92e5e6e --- a/.gitignore +++ b/.gitignore @@ -90,3 +90,9 @@ dist # OS Specific files .DS_Store + +# I'm using /output folder to export the markdown files. +output + +# I'm using /medium-export folder to keep the contents of the folder downloaded from the medium profile. 
+medium-export diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 0000000..739f6a5 --- /dev/null +++ b/.prettierignore @@ -0,0 +1,2 @@ +medium-export +output diff --git a/.prettierrc b/.prettierrc deleted file mode 100644 index b2095be..0000000 --- a/.prettierrc +++ /dev/null @@ -1,4 +0,0 @@ -{ - "semi": false, - "singleQuote": true -} diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 5e4a9ea..0000000 --- a/.travis.yml +++ /dev/null @@ -1,21 +0,0 @@ -language: node_js -notifications: - email: false -cache: - directories: - - node_modules -node_js: - - '10' - - '8' -install: - - travis_retry npm install -script: - - npm run test -after_success: - - npm run codecov -jobs: - include: - - stage: release - node_js: lts/* - script: - - npm run semantic-release diff --git a/README.md b/README.md index 917d7a4..8f466fd 100644 --- a/README.md +++ b/README.md @@ -1,38 +1,173 @@ -[![Build Status](https://travis-ci.org/xdamman/mediumexporter.svg?branch=master)](https://travis-ci.org/xdamman/mediumexporter) -[![Coverage Status](https://coveralls.io/repos/github/xdamman/mediumexporter/badge.svg?branch=master)](https://coveralls.io/github/xdamman/mediumexporter?branch=master) +# medium-to-gatsby -# Medium Exporter + A CLI to convert your medium exported .html files to gatsby .md files. + + ## Features +- Converts medium .html files and outputs gatsby .md files. +- Customize output via templates +- Downloads post images from medium and saves locally +- Handles embedded tweets +- Inlines github gists +- Allows default language for code blocks. +- Skips over drafts and post replies. +- Generates report when done. -Export your stories published on medium.com to markdown. +## Installation +`$ npm install -g https://github.com/jamischarles/export-medium-to-gatsby` (maybe it'll go on npm eventually) -## Usage +## Steps +1. 
[Download your medium posts as an archive from medium](https://help.medium.com/hc/en-us/articles/115004745787-Download-your-information). +2. Install this CLI via #Installation step above +3. Save a template file (see template section below) where you'll be running + your export command. +4. Customize the template.js file you downloaded to match the [frontmatter](https://jekyllrb.com/docs/front-matter/) fields your gatsby blog requires. Here you also define what folder in your blog medium images should be downloaded to. +5. Run the CLI and use the `medium-export/posts` folder as the input, and for output either directly output to your `content/posts` folder, or copy it there after generating the files. +6. Verify that the generated files are correct and looks good. +7. Make any CSS and styling adjustments as needed. +8. Do a happy dance. - ./index.js {url} - -O, --output - write to specified output directory - -I, --info – Show information about the medium post - --hugo - enable gohugo.io shortcodes - --frontmatter - enable frontmatter - --jekyll - format content and images for us in Jekyll blogs +## CLI Usage +``` + Usage + $ medium2gatsby -## CLI example + Options + --output, -o Destination folder for output files. Defaults to './'. + --template, -t Template used to generate post files. + --help, -h Shows usage instructions -If not output directory is specified, images and content will be downloaded into `/content` + Examples + $ medium2gatsby . -o posts -t template.js + $ medium2gatsby 2018-04-02_Introducing-the-react-testing-library----e3a274307e65.html -o output -t template.js + $ medium2gatsby ~/Downloads/medium-export/posts -o . -t template.js +``` - ./index.js https://medium.com/@PatrickHeneise/malaysia-16be98ab673e +## Recommended styling +### Images and subtitles +Images and subtitles will been converted to +`
Subtitle
` same as medium used. -## programmatic example +Gatsby injects a `

` in there. To fix spacing I suggest you add the following to your template's +CSS:`figure > p {margin-bottom:0px !important;}`. -### get individual posts +You can use `figure figcaption {}` to style image subtitles. - async function example() { - mediumexporter.getPost(link, { - output: "content/posts", - hugo: true, - frontmatter: true - }) - } +### Fenced Links +If you used fenced links on medium to make links stand out, those links will now +have show up as `some text.` This +CSS should approximate the medium fenced link style: +```css +.fenced-link { + background-color:#0000000d; + font-family:monospace; + text-decoration:underline; + padding:2px; +} +``` -### get feeds (default page size is 10) - const exporter = require('./index') - exporter.getFeed('https://medium.com/feed/@xdamman', { output: 'content' }) + +## Customize via templates +Based on which gatsby theme you're using you may need to generate different +frontmatter fields. + +Here are some example `template.js` you can save and pass to the CLI via the `-t` flag. 
+ +### Template ex1: Different folder for each post +- specifies `2018-04-16` date format in frontmatter `date` field +- generates a separate folder for each post ie: `content/posts/introducing-react/index.md +- saves post images to `/images2` (relative to the post folder) +- posts will show on site as `/posts/[slug-name]` +- defauls all code fences to use `'js'` + +```js +module.exports = { + render: function(data) { + // data.published is Date ISO format: 2018-04-16T14:48:00.000Z + var date = new Date(data.published); + var prettyDate = + date.getFullYear() + + '-' + + (date.getMonth() + 1).toString().padStart(2, 0) + + '-' + + date + .getDate() + .toString() + .padStart(2, 0); //2018-04-16 + + var template = `\ +--- +slug: "/posts/${data.titleForSlug}/" +date: ${prettyDate} +title: "${data.title}" +draft: false +description: "${data.description}" +categories: [] +keywords: [${data.tags.join(',')}] +--- + +${data.body} +`; + + return template; + }, + getOptions: function() { + return { + folderForEachSlug: true, // separate folder for each blog post, where index.md and post images will live + imagePath: '/images2', // . Used in the markdown files. + defaultCodeBlockLanguage: 'js', // code fenced by default will be ``` with no lang. If most of your code blocks are in a specific lang, set this here. + }; + }, +}; + +``` + +### Template ex2: Same folder for all posts +- specifies `2018-04-16T14:48:00.000Z` date format (ISO, which is default) in frontmatter `date` field +- saves all generated posts to same folder defined in `-o` options for CLI. Files are named via slug name from medium. +- saves post images to `/Users/jacharles/dev/blog/content/posts/introducing-the-react-testing-library/images` +- defauls all code fences to use `''` (no language). 
+ +```js +module.exports = { + render: function(data) { + var template = `\ +--- +slug: ${data.titleForSlug} +date: ${data.published} +title: "${data.title}" +template: "post" +draft: false +description: "${data.description}" +category: "" +tags: [${data.tags.join(',')}] +--- + +${data.body} +`; + + return template; + }, + getOptions: function() { + return { + folderForEachSlug: false, // same folder for all posts + imagePath: '/media', // . Used in the markdown files. + // This field is ignored when folderForEachSlug:true. Should be absolute. Location where medium images will be saved. + imageFolder: + '/Users/jacharles/dev/blog/static/media', + defaultCodeBlockLanguage: '', // code fenced by default will be ``` with no lang. If most of your code blocks are in a specific lang, set this here. + }; + }, +}; + +``` + +## TODO and Help needed +I'm about ready to move on from this, but would love help with the following if +anybody feels inclined: +- [ ] Better progress / error messages. Should notify which articles fail for whichever reason +- [ ] Error handling is very lacking in many places. Could / should be improved to be more robust especially around downloading posts / images from medium. +- [ ] Adding tests (prefer something dead simple like mocha). Currently there + are zero tests. +- [ ] More generator targets. This repo could fairly easily be forked and expanded to include other targets like jekyll, or + other static site generators. (low priority) (medium2markdown) diff --git a/bin/medium2mdx b/bin/medium2mdx new file mode 100755 index 0000000..0813745 --- /dev/null +++ b/bin/medium2mdx @@ -0,0 +1,4 @@ +#!/usr/bin/env node + +//executable CLI entry point into this util +require('../lib/index.js'); diff --git a/index.js b/index.js deleted file mode 100755 index 526cd45..0000000 --- a/index.js +++ /dev/null @@ -1,49 +0,0 @@ -#! 
/usr/bin/env node - -if (require.main !== module) { - module.exports = { - getFeed: require('./lib/get-feed'), - getPost: require('./lib/get-post') - } -} else { - const program = require('commander') - const package = require('./package.json') - - program - .version(package.version) - .description(package.description) - .usage('[options] ') - .option('-I, --info', 'Show information about the medium post') - .option( - '-O, --output ', - 'File (if URL is a post) or directory (if URL is a feed) to output to' - ) - .option('--hugo', 'use gohugo.io specific shortcodes') - .option( - '--jekyll', - 'prefix the files with date, place images in an assets/' - ) - .option('--frontmatter', 'enable markdown frontmatter') - .option('-d, --debug', 'Show debugging info') - .on('--help', function() { - console.log(' Examples:') - console.log('') - console.log(' $ mediumexporter -O content mediumurl') - console.log(' $ mediumexporter --hugo mediumurl') - console.log('') - }) - - program.parse(process.argv) - - var mediumURL = program.args[0] - - if (!mediumURL) { - console.log('missing medium url') - } - - if (mediumURL.match(/medium\.com\/feed\//)) { - require('./lib/get-feed')(mediumURL, program) - } else { - require('./lib/get-post')(mediumURL, program) - } -} diff --git a/lib/converter.js b/lib/converter.js new file mode 100755 index 0000000..550e962 --- /dev/null +++ b/lib/converter.js @@ -0,0 +1,530 @@ +// FIXME: add file description... +var fakeUa = require('fake-useragent'); +var request = require('request'); +var fs = require('fs'); +var path = require('path'); +var cheerio = require('cheerio'); +var util = require('util'); +var mkdirp = require('mkdirp'); +var logger = require('./logger'); +var markdownUtils = require('./markdown'); +var press = require('./press'); + +var makeRequest = util.promisify(request.get); + +// FIXME: Figure out a better way to identify replies. 
+var KNOWN_POSTS_TO_SKIP = [ + 'Hi Rohan,', + 'IS is blocking CORS requests by default.', + 'Nice catch. I\'ve fixed the typo. Thanks a lot 😀' +] + +var ImageStorageStrategies = Object.freeze({ + LOCAL: 'LOCAL' +}) + +// global state. FIXME: consider localizing this more... +var report = { + posts: { + attempted: [], + succeeded: [], + failed: [], + drafts: [], + replies: [], + }, + gists: { + attempted: [], + succeeded: [], + failed: [], + }, + images: { + attempted: [], + succeeded: [], + failed: [], + }, +}; + +// handle promise errors +process.on('unhandledRejection', up => { + console.log('err', up); + // throw up; +}); + +function convertToSlug(Text) +{ + return Text + .toLowerCase() + .replace(/ /g,'-') + .replace(/[^\w-]+/g,'') + ; +} + +// primary entry point +async function convertMediumFile(filePath, outputFolder, templatePath, export_drafts) { + press.print('Converting: ', true, true); + press.printItem(`PATH: ${filePath}`); + + var template = require(templatePath); + var options = template.getOptions(); + + // don't process drafts + var filename = path.basename(filePath, '.html'); + + press.printItem(`FILE: ${filename}`, false, true); + + if (filename.startsWith('draft')) { + // console.log('Skipping over draft file ', filePath); + report.posts.drafts.push(filePath); + // throw 'draft file'; // equivalent of promise.reject + // if we don't want to export drafts then bail + if (!export_drafts) { + press.printItem(`This is a Draft. Export draft feature was not set. 
Hence, skipping...`, false, false, 1); + return + }; + } + + report.posts.attempted.push(filePath); + + var srcFilepath = filePath; + var content = fs.readFileSync(filePath); + + try { + var postData = await gatherPostData(content, options, srcFilepath); + postData.draft = export_drafts + + var imageFolder = path.resolve(options.imagePath); + var output = template.render(postData); + + // if true, make folder for each slug, and name it '[slug]/index.md' + if (options.folderForEachSlug) { + outputFolder = path.join(outputFolder, postData.titleForSlug); + imageFolder = path.join(outputFolder, options.imagePath); + filePath = 'index'; + } + + // make outputFolder if it doesn't exist yet + mkdirp.sync(outputFolder); + + // console.log( + // `processing: ${srcFilepath} -> ${path.join(outputFolder, filePath)}.md`, + // ); + + try { + // render post file to folder + writePostToFile(output, filePath, outputFolder); + } catch (e) { + press.printItem(`Successfully wrote the post to: ${output}`, false, false, 3); + } + + if (options.imageStorageStrategy === ImageStorageStrategies.LOCAL) { + try { + // save post images to the local image folder + await saveImagesToLocal(imageFolder, postData.images); + } catch (e) { + press.printItem(`An error occurred while saving the images to local directory. 
Blog: ${postData.titleForSlug}`, false, false, 2); + } + } + + report.posts.succeeded.push(filePath); + } catch (err) { + // reject(err); + // re-throw if you want it to bubble up + if (err.type != 'silent') throw err; + } + // }); +} + +async function gatherPostData(content, options, filePath) { + press.printItem(`Gathering post data started: ${filePath}`, false, false, 0); + + var $ = cheerio.load(content); + + try { + await inlineGists($); + } catch(e) { + press.printItem(`An error occurred while inlining Gists: ${filePath}`, false, false, 2); + } + + var filename = path.basename(filePath, '.html'); + var is_draft = filename.startsWith('draft'); + var blogTitle = $(".graf--leading").first().text(); + + if (KNOWN_POSTS_TO_SKIP.some((post) => blogTitle.startsWith(post))) { + press.printItem(`This is a reply, not a standalone post. Hence, skipping...`, false, false, 1); + report.posts.replies.push(filePath); + // FIXME: consider setting type of err and then ignoring it at the higher level + throw new SilentError('reply post. Skip over this one: ' + titleForSlug); + } + + // TODO: add no match condition... + if (!is_draft){ + var canonicalLink = $('.p-canonical').attr('href'); + var match = canonicalLink.match( + /https:\/\/medium\.com\/.+\/(.+)-[a-z0-9]+$/i, + ); + var titleForSlug = match[1]; + } else { + // construct a canonical link + var canonicalLink = $('footer > p > a').attr('href'); + var titleForSlug = convertToSlug(blogTitle) + } + + // This will get the image urls, and rewrite the src in the content + var imagesToSave = getMediumImages(options.imageStorageStrategy, $, options.imagePath, titleForSlug); + + var subtitle = $('section.p-summary').text(); + + // $2 is for the post on medium instead of the local file... 
+ var postBody = await scrapeMetaDetailsFromPost(canonicalLink); + + // check if standalone post or reply + var isReplyPost = postBody.match(/inResponseToPostId":"[0-9a-z]+"/); // this is in markup for reply posts + + if (isReplyPost) { + press.printItem(`This is a reply, not a standalone post. Hence, skipping...`, false, false, 1); + report.posts.replies.push(filePath); + // FIXME: consider setting type of err and then ignoring it at the higher level + throw new SilentError('reply post. Skip over this one: ' + titleForSlug); + } + + var $2 = cheerio.load(postBody); + var description = $2('meta[name=description]').attr('content'); // from page... + + var schemaTags = $2('script[type="application/ld+json"]'); + + var metaData = JSON.parse(schemaTags[0].children[0].data); + + var tags = getTags(metaData.keywords); + + var title = $('h1').text(); + + // FIXME: put this in fn + // REMOVE h1 and avatar section + $('h1') + .next() + .remove(); // remove div avatar domEl right after h1 + $('h1').remove(); + + // process code blocks + // medium exports inline code block as and multi-line as


+  // We need to wrap the content of the <pre> with <code> tags so turndown parser won't escape the codeblock content
+  $('pre').map(function(i, el) {
+    var codeBlockContent = $(this).html();
+    codeBlockContent = `${codeBlockContent}`;
+
+    var newEl = $(this).html(codeBlockContent);
+    return newEl;
+  });
+
+  // embedded tweets:
+  // medium returns an empty <a> which turndown throws out before we can process it.
+  // add dummy link text so turndown won't discard it
+  $('blockquote.twitter-tweet a').text('[Embedded tweet]');
+
+  let posts = null;
+  try {
+    posts = convertHtmlToMarkdown($('.section-content').html(), options)
+  } catch(e) {
+    press.printItem(`convertHtmlToMarkdown...`, false, false, 2);
+  }
+
+  var post = {
+    title: title,
+    description: description,
+    subtitle: subtitle,
+    published: $('time').attr('datetime'),
+    bodyRaw: $('.section-content').html(),
+    titleForSlug: titleForSlug,
+    tags: tags,
+    images: imagesToSave, // data for images from the medium post
+    body: posts
+  };
+
+  return post;
+}
+
+// Extracts tag names from an array of strings; currently short-circuited to always return [] (see FIXME below).
+function getTags(arr) {
+  var tags = [];
+
+  // FIXME: This had to be added. The unconditional return below makes the rest of this function unreachable.
+  return [];
+
+  // (unreachable) only take format of 'Tag:JavaScript', and keep latter portion
+  arr.forEach(item => {
+    if (item.startsWith('Tag:')) {
+      tags.push(item.split(':')[1]); // text between the first and second ':'
+    }
+  });
+
+  return tags;
+}
+
+var suffix = /\.html$/i; // case-insensitive match for a trailing ".html"; not referenced anywhere in the code visible here
+
+// Writes `content` to "<outputFolder>/<basename of oldFilePath without .html>.md". FIXME: get name from date + slug
+function writePostToFile(content, oldFilePath, outputFolder) {
+  var fileName = path.basename(oldFilePath, '.html'); // drops the directory part and a trailing '.html', if present
+
+  var newPath = path.resolve(path.join(outputFolder, fileName) + '.md'); // absolute path of the markdown file
+
+  // console.log('newPath', newPath);
+  fs.writeFileSync(newPath, content); // synchronous write; any fs error propagates to the caller
+}
+
+// Converts the post body HTML to markdown by delegating to markdownUtils.transformHtmlToMarkdown; templateOptions is passed through unchanged.
+function convertHtmlToMarkdown(html, templateOptions) {
+  return markdownUtils.transformHtmlToMarkdown(html, templateOptions);
+}
+
+async function scrapeMetaDetailsFromPost(url) { // GETs `url` and resolves to the response body
+  var headers = {
+    'User-Agent': fakeUa(), // value produced by the fake-useragent package
+  };
+
+  // FIXME: add error handling conditions... (no status-code check here; a rejected request propagates to the caller)
+  var resp = await makeRequest({url: url, headers: headers});
+  return resp.body;
+}
+
+// attempts to take gist script tags, download the raw content, and place it in a <pre> tag, which will be converted to
+// a fenced block (```) by turndown
+async function inlineGists($) {
+  // get all script tags on the page
+  // FIXME: can do away with promises here entirely?
+  var promises = [];
+
+  $('script').each(async function(i, item) {
+    var prom = new Promise(async (resolve, reject) => {
+      var src = $(this).attr('src');
+      var isGist = src.includes('gist');
+      if (isGist) {
+        try {
+          // console.log('feching raw gist source for: ', src);
+          report.gists.attempted.push(src);
+          var rawGist = await getRawGist(src);
+          report.gists.succeeded.push(src);
+
+          // replace rawGist in markup
+          // FIXME: just modify this in turndown?
+          var inlineCode = $(`
${rawGist}
`); //this turns into ``` codefence + + // FIXME: guard to ensure
parent is removed + // Replace the
parent node with code fence + $(this) + .parent() + .replaceWith(inlineCode); + + resolve(); + } catch (e) { + report.gists.failed.push(src); + reject(e); + } + } + }); + promises.push(prom); + }); + + return await Promise.all(promises); +} + +// get the raw gist from github +async function getRawGist(gistUrl) { + var newUrl = gistUrl.replace('github.com', 'githubusercontent.com'); + + // remove suffix (like .js) (maybe use it for code fencing later...) + // FIXME: this is hacky + var gistID = newUrl.split('/')[4]; // FIXME: guard for error + if (gistID.includes('.')) { + var ext = path.extname(gistID); + newUrl = newUrl.replace(ext, ''); // srip extension (needed for raw fetch to work) + } + + newUrl += '/raw'; + + // make the call + var resp = await makeRequest({url: newUrl}); + if (resp.statusCode === 200) { + return resp.body; + } +} + +// returns urls of images to download and re-writes post urls to point locally +function getMediumImages(imageStorageStrategy, $, imageBasePath, postSlug) { + var images = []; + + $('img.graf-image').each(async function(i, item) { + var imageName = $(this).attr('data-image-id'); + var ext = path.extname(imageName); + + // get max resolution of image + var imgUrl = `https://cdn-images-1.medium.com/max/2600/${imageName}`; + + var localImageName = `${postSlug}-${i}${ext}`; // some-post-name-01.jpg + var localImagePath = path.join(imageBasePath, localImageName); // full path including folder + + var imgData = { + mediumUrl: imgUrl, + localName: localImageName, + localPath: localImagePath, // local path including filename we'll save it as + }; + + images.push(imgData); + + // Rewrite img urls in post if the storage strategy is local. 
+ if (imageStorageStrategy === ImageStorageStrategies.LOCAL) { + $(this).attr('src', localImagePath); + } + }); + + return images; +} + +async function saveImagesToLocal(imageFolder, images) { + var imagePromises = images.map(function(image) { + return new Promise(function(resolve, reject) { + var filePath = path.join(imageFolder, image.localName); + mkdirp.sync(imageFolder); // fs.writeFileSync(p, images[0].binary, 'binary'); + + press.printItem(`Downloading image : ${image.mediumUrl} -> ${filePath}`, false, false, 0); + report.images.attempted.push(image.mediumUrl); + // request(image.mediumUrl).pipe(fs.createWriteStream(filePath)); // request image from medium CDN and save locally. TODO: add err handling + + var writer = fs.createWriteStream(filePath); + + request + .get(image.mediumUrl) + .on('complete', function(response) { + // FIXME: how do we measure success / failure here? + report.images.succeeded.push(`${image.mediumUrl}->${filePath}`); + resolve(response); + }) + .on('error', function(err) { + console.log(err); + press.printItem(`An error occurred while downloading image : ${image.mediumUrl} -> ${filePath}`, false, false, 2); + report.images.failed.push(`${image.mediumUrl}->${filePath}`); + reject(err); + }) + .pipe(writer); + }); + }); + + return await Promise.all(imagePromises); +} + +// using this allows us to stop flow execution, but not throw all the way up the chain... 
+class SilentError extends Error { + constructor(...args) { + super(...args); + Error.captureStackTrace(this, SilentError); + this.type = 'silent'; + } +} + +function printPrettyReport() { + var postsAttempted = report.posts.attempted.length; + var postsSucceeded = report.posts.succeeded.length; + var postsFailed = report.posts.failed.length; + var postsFailedDetail = report.posts.failed; + var postDrafts = report.posts.drafts.length; + var postReplies = report.posts.replies.length; + + var imagesAttempted = report.images.attempted.length; + var imagesSucceeded = report.images.succeeded.length; + var imagesFailed = report.images.failed.length; + var imagesFailedDetail = report.images.failed; + + var gistAttempted = report.gists.attempted.length; + var gistSucceeded = report.gists.succeeded.length; + var gistFailed = report.gists.failed.length; + var gistFailedDetail = report.gists.failed; + + console.log('##############################################################'); + console.log('CONVERSION METRICS'); + console.log('posts attempted', postsAttempted); + console.log('posts succeeded', postsSucceeded); + console.log('posts replies that were ignored:', postReplies); + console.log('posts drafts that were not attempted:', postDrafts); + console.log('posts failed', postsFailed); + console.log('Failed posts:', postsFailedDetail); + console.log(''); + + console.log('medium images attempted', imagesAttempted); + console.log('images succeeded', imagesSucceeded); + console.log('images failed', imagesFailed); + console.log('Failed images:', imagesFailedDetail); + console.log(''); + + console.log('gists inlining attempted', gistAttempted); + console.log('gists succeeded', gistSucceeded); + console.log('gists failed', gistFailed); + console.log('Failed gists:', gistFailedDetail); + + console.log('##############################################################'); +} + +function saveReportToFile(outputFolder) { + fs.writeFileSync( + path.join(outputFolder, 
'conversion_report.json'), + JSON.stringify(report), + ); +} + +// writePostFile(metaTemplate); +module.exports = { + convert: async function(srcPath, outputFolder = '.', templatePathStr, export_drafts) { + press.announceCheckpoint('🐱 Started converting.'); + + var isDir = fs.lstatSync(srcPath).isDirectory(); + // TODO: This is un-used. + var isFile = fs.lstatSync(srcPath).isFile(); + + var defaultTemplate = path.resolve( + path.join(__dirname, '../templates/default.js'), + ); + + var templatePath = defaultTemplate; + // if template passed in, load that instead of default + if (templatePathStr) { + templatePath = path.resolve(templatePathStr); + } + + var promises = []; + + if (isDir) { + // folder was passed in, so get all html files for folders + fs.readdirSync(srcPath).forEach(file => { + var curFile = path.join(srcPath, file); + + if (file.endsWith('.html')) { + promises.push(convertMediumFile(curFile, outputFolder, templatePath, export_drafts)); + // } else { + // promises.push(Promise.resolve('not html file')); // FIXME: is this needed? 
+ } + }); + } else { + var promises = [ + convertMediumFile(path.resolve(srcPath), outputFolder, templatePath, export_drafts), + ]; + } + + try { + var result = await Promise.all(promises); + // console.log('ALL DONE', report); + printPrettyReport(); + saveReportToFile(outputFolder); + console.log( + `Medium files from "${path.resolve( + srcPath, + )}" have finished converting to "${path.resolve( + outputFolder, + )}" using the "${templatePathStr}" template.`, + ); + console.log( + `Detailed output report named "conversion_report.json" can be found in the output folder.`, + ); + } catch (e) { + console.log('Error during conversion!', e); + } + }, +}; diff --git a/lib/get-feed.js b/lib/get-feed.js deleted file mode 100644 index 91ebc13..0000000 --- a/lib/get-feed.js +++ /dev/null @@ -1,23 +0,0 @@ -const rssParser = require('rss-parser') -const parser = new rssParser() -const fs = require('fs') -const path = require('path') -const getPost = require('./get-post') - -module.exports = async function(feedURL, program = {}) { - const promises = [] - const data = await parser.parseURL(feedURL) - - data.items.forEach(entry => { - promises.push(getPost(entry.link, program)) - }) - - return Promise.all(promises) - .then(results => { - console.log(results) - return - }) - .catch(err => { - console.log(err) - }) -} diff --git a/lib/get-post.js b/lib/get-post.js deleted file mode 100644 index 9cd3e3d..0000000 --- a/lib/get-post.js +++ /dev/null @@ -1,206 +0,0 @@ -const utils = require('./utils') -const path = require('path') -const fs = require('fs') -const r2 = require('r2') -const slugify = require('underscore.string/slugify') -let options = {} - -function createFolder(path) { - try { - if (!fs.existsSync(path)) { - fs.mkdirSync(path, { recursive: true }) - } - } catch (err) { - // may exist already, ignore - console.error(err) - } - return -} - -module.exports = async function(mediumURL, params = {}) { - options = params - - if (!mediumURL || mediumURL.substr(0, 18) !== 
'https://medium.com') { - throw new Error('no url or not a medium.com url') - } - - let output = null - const json = await utils.loadMediumPost(mediumURL, options) - const s = json.payload.value - const story = {} - const images = [] - - story.title = s.title.replace(/:/g, ':') - story.subtitle = s.virtuals.subtitle.trim().replace(/:/g, ':') - story.author = s.displayAuthor - story.date = new Date(s.createdAt).toJSON() - story.slug = s.slug - story.url = s.canonicalUrl - story.images = [] - story.language = s.detectedLanguage - if (s.virtuals.tags) { - story.tags = s.virtuals.tags.map(t => t.slug) - } - if (s.license && s.license !== 0) { - story.license = s.license - } - - // If the author's not available, get it from somewhere else - let authors = [] - if (json.payload.references && json.payload.references.User) { - Object.keys(json.payload.references.User).forEach(k => { - let u = json.payload.references.User[k] - authors.push({ - name: u.name, - username: u.username, - userId: u.userId - }) - }) - story.authors = authors - - if (!story.author) { - story.author = authors[0].name - } - } - - if (s.virtuals.previewImage) { - story.featuredImage = s.virtuals.previewImage.imageId - } - - if (params && params.info) { - process.exit(0) - } - - if (params) { - output = params.output ? 
params.output : 'content' - } else { - output = process.env.PWD - } - - story.sections = s.content.bodyModel.sections - story.paragraphs = s.content.bodyModel.paragraphs - - const sections = [] - for (let i = 0; i < story.sections.length; i++) { - const s = story.sections[i] - const section = utils.processSection(s, story.slug, images, options) - sections[s.startIndex] = section - } - - if (story.paragraphs.length > 1) { - story.subtitle = story.paragraphs[1].text - } - - story.markdown = [] - if (!options.frontmatter) { - story.markdown.push('\n# ' + story.title.replace(/\n/g, '\n# ')) - if (undefined != story.subtitle) { - story.markdown.push('\n## ' + story.subtitle.replace(/#+/, '')) - } - } - - let lastParagraph = null - story.paragraphs = story.paragraphs.filter((p, idx) => { - if (p.type === 8 && lastParagraph && lastParagraph.type === 8) { - lastParagraph.text += '\n\n' + p.text - return false - } - lastParagraph = p - return true - }) - - const promises = [] - for (let i = 2; i < story.paragraphs.length; i++) { - if (sections[i]) story.markdown.push(sections[i]) - - const promise = new Promise(function(resolve, reject) { - const p = story.paragraphs[i] - - const text = utils.processParagraph(p, story.slug, images, options) - return resolve(text) - }) - promises.push(promise) - } - - return Promise.all(promises) - .then(async results => { - if (!!images.length) { - let featuredImage = story.featuredImage - let outputPath = path.join(output, story.slug, 'images') - if (!!options.jekyll) { - outputPath = path.join(output, `assets/images/${story.slug}`) - } - createFolder(outputPath) - story.images = await utils.downloadImages(images, { - featuredImage: featuredImage, - imageFolder: outputPath - }) - } else { - createFolder(output) - } - - for (let text of results) { - story.markdown.push(text) - } - - if (params && params.debug) { - console.log('debug', story.paragraphs) - } - - // frontmatter - let outputText = '' - if (options.frontmatter) { - outputText = 
'---\n' - outputText += `slug: ${story.slug}\n` - outputText += `date: ${story.date}\n` - outputText += `author: "${story.author}"\n` - outputText += `title: "${story.title}"\n` - if (story.subtitle) { - outputText += `subtitle: "${story.subtitle}"\n` - } - if (story.images.length > 0) { - outputText += 'images:\n' - for (const image of story.images) { - outputText += ` - ${image}\n` - } - } - if (story.tags.length > 0) { - outputText += 'tags:\n' - for (const tag of story.tags) { - outputText += ` - ${tag}\n` - } - outputText += 'keywords:\n' - for (const tag of story.tags) { - outputText += ` - ${tag}\n` - } - } - outputText += 'draft: true' + '\n' - outputText += '---\n' - } - outputText += story.markdown.join('\n') - - let outputPath = `${output}/${story.slug}.md` - - if (!!options.jekyll) { - outputPath = `${output}/${story.date.slice(0, 10)}-${story.slug}.md` - } - if (output) { - if (!!images.length && !options.jekyll) { - outputPath = path.join(output, story.slug) + '/index.md' - } - fs.writeFileSync(outputPath, outputText) - // return post object if required, else just exit - return options.returnObject ? story : undefined - } else if (!output && params && params.commands) { - console.log(outputText) - return outputText - } else { - return outputText - } - }) - .catch(err => { - console.log('something went wrong') - console.log(err) - return err - }) -} diff --git a/lib/index.js b/lib/index.js new file mode 100644 index 0000000..e2ca528 --- /dev/null +++ b/lib/index.js @@ -0,0 +1,77 @@ +/** + * Module dependencies. + */ +var meow = require('meow'); +var press = require('./press'); + +/** + * Local libs + */ +var converter = require('./converter.js'); + +var cli = meow( + ` + Usage + $ medium2gatsby + + Options + --output, -o Destination folder for output files. Defaults to './'. + --template, -t Template used to generate post files. + --drafts, -d set flag to export drafts along with other posts. Default, false. 
+ --help, -h Shows usage instructions + + Examples + $ medium2gatsby . -o posts -t template.js + $ medium2gatsby 2018-04-02_Introducing-the-react-testing-library----e3a274307e65.html -o output -t template.js + +`, + { + flags: { + drafts: { + type: 'boolean', + alias: 'd', + default: false + }, + output: { + type: 'string', + alias: 'o', + }, + template: { + type: 'string', + alias: 't', + }, + }, + }, +); +/* +{ + input: ['unicorns'], + flags: {rainbow: true}, + ... +} +*/ + +// show help if no args passed +if (cli.input.length < 1) { + cli.showHelp(); +} + +var srcPath = cli.input[0]; +var destPath = cli.flags.output; +var templatePath = cli.flags.template; +var export_drafts = cli.flags.drafts; + +/* ====================================================================================== */ +/* Execution starts from here */ +/* ====================================================================================== */ + +press.print('======================= 💥 Welcome to Medium to md(x) converter 💥 =======================', true, true); + +press.print('💡 Following context has been initialized:'); +press.printItem(`⬇️ INPUT: ${srcPath}`, true); +press.printItem(`⬆️ OUTPUT (-o): ${destPath}`); +press.printItem(`💅 TEMPLATE (-t): ${templatePath}`); +press.printItem(`🚧 SHOULD EXPORT DRAFTS? 
(-d): ${export_drafts}`); + +converter.convert(srcPath, destPath, templatePath, export_drafts); +// foo(cli.input[0], cli.flags); diff --git a/lib/logger.js b/lib/logger.js new file mode 100644 index 0000000..3db97fa --- /dev/null +++ b/lib/logger.js @@ -0,0 +1,8 @@ +const Logger = { + info: console.log, + debug: console.log, + warn: console.warn, + error: console.error +}; + +module.exports = Logger; diff --git a/lib/markdown.js b/lib/markdown.js new file mode 100644 index 0000000..ea0b293 --- /dev/null +++ b/lib/markdown.js @@ -0,0 +1,175 @@ +// Utils related to scraping pages and converting them to markdown + +// converts html to markdown +var TurndownService = require('turndown'); + +// global placeholder allowing us to pass in options from the template... +var templateOptions; + +var turnDownOptions = { + // linkReferenceStyle: 'collapsed', + codeBlockStyle: 'fenced', +}; +var turndownService = new TurndownService(turnDownOptions); +// strip