Mirror of https://github.com/DavidAnson/markdownlint.git, synced 2025-12-16 14:00:13 +01:00
Move micromark-parse.mjs from helpers to library, remove all dependencies from helpers.
commit 3599f694ba
parent 1e71f6f44e
7 changed files with 6 additions and 14 deletions
@@ -9,7 +9,7 @@ import { promisify } from "node:util";
 import { initialize as cacheInitialize } from "./cache.mjs";
 import { version } from "./constants.mjs";
 import rules from "./rules.mjs";
-import { parse as micromarkParse } from "../helpers/micromark-parse.mjs";
+import { parse as micromarkParse } from "./micromark-parse.mjs";
 import * as helpers from "../helpers/helpers.cjs";

 /**
@@ -2,7 +2,7 @@

 import { addErrorDetailIf, escapeForRegExp, hasOverlap } from "../helpers/helpers.cjs";
 import { filterByPredicate, filterByTypes } from "../helpers/micromark-helpers.cjs";
-import { parse } from "../helpers/micromark-parse.mjs";
+import { parse } from "./micromark-parse.mjs";

 const ignoredChildTypes = new Set(
   [ "codeFencedFence", "definition", "reference", "resource" ]
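The two hunks above capture the in-library side of the move: callers inside lib/ now import the parser with a local specifier instead of reaching into helpers/. A minimal before/after sketch for any other module in lib/ that needs the parser (the consumer module is hypothetical; the import forms are exactly the ones shown in the hunks):

// Hypothetical lib/example-consumer.mjs -- not part of this commit.
// Before: the parser lived in the helpers directory.
//   import { parse } from "../helpers/micromark-parse.mjs";
// After: the parser is a sibling module inside lib/.
import { parse } from "./micromark-parse.mjs";

const tokens = parse("Some *emphasized* text\n");
console.log(tokens.map((token) => token.type));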
309  lib/micromark-parse.mjs  (Normal file)
@@ -0,0 +1,309 @@
// @ts-check

import { directive } from "micromark-extension-directive";
import { gfmAutolinkLiteral } from "micromark-extension-gfm-autolink-literal";
import { gfmFootnote } from "micromark-extension-gfm-footnote";
import { gfmTable } from "micromark-extension-gfm-table";
import { math } from "micromark-extension-math";
import { parse as micromarkParse, postprocess as micromarkPostprocess, preprocess as micromarkPreprocess } from "micromark";
// micromark-core-commonmark is not a dependency because this instance must match what's used by micromark
// eslint-disable-next-line n/no-extraneous-import
import { labelEnd } from "micromark-core-commonmark";
import { isHtmlFlowComment } from "../helpers/micromark-helpers.cjs";
import { flatTokensSymbol, htmlFlowSymbol, newLineRe } from "../helpers/shared.cjs";

/** @typedef {import("micromark-util-types").Event} Event */
/** @typedef {import("micromark-util-types").ParseOptions} MicromarkParseOptions */
/** @typedef {import("micromark-util-types").State} State */
/** @typedef {import("micromark-util-types").Token} Token */
/** @typedef {import("micromark-util-types").Tokenizer} Tokenizer */
/** @typedef {import("./micromark-types.d.mts")} */
/** @typedef {import("../lib/markdownlint.mjs").MicromarkToken} MicromarkToken */

/**
 * Parse options.
 *
 * @typedef {Object} ParseOptions
 * @property {boolean} [freezeTokens] Whether to freeze output Tokens.
 */

/**
 * Parses a Markdown document and returns Micromark events.
 *
 * @param {string} markdown Markdown document.
 * @param {MicromarkParseOptions} [micromarkParseOptions] Options for micromark.
 * @returns {Event[]} Micromark events.
 */
export function getEvents(
  markdown,
  micromarkParseOptions = {}
) {
  // Customize extensions list to add useful extensions
  const extensions = [
    directive(),
    gfmAutolinkLiteral(),
    gfmFootnote(),
    gfmTable(),
    math(),
    ...(micromarkParseOptions.extensions || [])
  ];

  // Shim labelEnd to identify undefined link labels
  /** @type {Event[][]} */
  const artificialEventLists = [];
  const tokenizeOriginal = labelEnd.tokenize;

  /** @type {Tokenizer} */
  function tokenizeShim(effects, okOriginal, nokOriginal) {
    // eslint-disable-next-line consistent-this, unicorn/no-this-assignment, no-invalid-this
    const tokenizeContext = this;
    const events = tokenizeContext.events;

    /** @type {State} */
    const nokShim = (code) => {
      // Find start of label (image or link)
      let indexStart = events.length;
      while (--indexStart >= 0) {
        const event = events[indexStart];
        const [ kind, token ] = event;
        if (kind === "enter") {
          const { type } = token;
          if ((type === "labelImage") || (type === "labelLink")) {
            // Found it
            break;
          }
        }
      }

      // If found...
      if (indexStart >= 0) {
        // Create artificial enter/exit events and replicate all data/lineEnding events within
        const eventStart = events[indexStart];
        const [ , eventStartToken ] = eventStart;
        const eventEnd = events[events.length - 1];
        const [ , eventEndToken ] = eventEnd;
        /** @type {Token} */
        const undefinedReferenceType = {
          "type": "undefinedReferenceShortcut",
          "start": eventStartToken.start,
          "end": eventEndToken.end
        };
        /** @type {Token} */
        const undefinedReference = {
          "type": "undefinedReference",
          "start": eventStartToken.start,
          "end": eventEndToken.end
        };
        const eventsToReplicate = events
          .slice(indexStart)
          .filter((event) => {
            const [ , eventToken ] = event;
            const { type } = eventToken;
            return (type === "data") || (type === "lineEnding");
          });

        // Determine the type of the undefined reference
        const previousUndefinedEvent = (artificialEventLists.length > 0) && artificialEventLists[artificialEventLists.length - 1][0];
        const previousUndefinedToken = previousUndefinedEvent && previousUndefinedEvent[1];
        if (
          previousUndefinedToken &&
          (previousUndefinedToken.end.line === undefinedReferenceType.start.line) &&
          (previousUndefinedToken.end.column === undefinedReferenceType.start.column)
        ) {
          // Previous undefined reference event is immediately before this one
          if (eventsToReplicate.length === 0) {
            // The pair represent a collapsed reference (ex: [...][])
            previousUndefinedToken.type = "undefinedReferenceCollapsed";
            previousUndefinedToken.end = eventEndToken.end;
          } else {
            // The pair represent a full reference (ex: [...][...])
            undefinedReferenceType.type = "undefinedReferenceFull";
            undefinedReferenceType.start = previousUndefinedToken.start;
            artificialEventLists.pop();
          }
        }

        // Create artificial event list and replicate content
        const text = eventsToReplicate
          .filter((event) => event[0] === "enter")
          .map((event) => tokenizeContext.sliceSerialize(event[1]))
          .join("")
          .trim();
        if ((text.length > 0) && !text.includes("]")) {
          /** @type {Event[]} */
          const artificialEvents = [];
          artificialEvents.push(
            [ "enter", undefinedReferenceType, tokenizeContext ],
            [ "enter", undefinedReference, tokenizeContext ]
          );
          for (const event of eventsToReplicate) {
            const [ kind, token ] = event;
            // Copy token because the current object will get modified by the parser
            artificialEvents.push([ kind, { ...token }, tokenizeContext ]);
          }
          artificialEvents.push(
            [ "exit", undefinedReference, tokenizeContext ],
            [ "exit", undefinedReferenceType, tokenizeContext ]
          );
          artificialEventLists.push(artificialEvents);
        }
      }

      // Continue with original behavior
      return nokOriginal(code);
    };

    // Shim nok handler of labelEnd's tokenize
    return tokenizeOriginal.call(tokenizeContext, effects, okOriginal, nokShim);
  }

  try {
    // Shim labelEnd behavior to detect undefined references
    labelEnd.tokenize = tokenizeShim;

    // Use micromark to parse document into Events
    const encoding = undefined;
    const eol = true;
    const parseContext = micromarkParse({ ...micromarkParseOptions, extensions });
    const chunks = micromarkPreprocess()(markdown, encoding, eol);
    const events = micromarkPostprocess(parseContext.document().write(chunks));

    // Append artificial events and return all events
    // eslint-disable-next-line unicorn/prefer-spread
    return events.concat(...artificialEventLists);
  } finally {
    // Restore shimmed labelEnd behavior
    labelEnd.tokenize = tokenizeOriginal;
  }
}

/**
 * Parses a Markdown document and returns micromark tokens (internal).
 *
 * @param {string} markdown Markdown document.
 * @param {ParseOptions} [parseOptions] Options.
 * @param {MicromarkParseOptions} [micromarkParseOptions] Options for micromark.
 * @param {number} [lineDelta] Offset for start/end line.
 * @param {MicromarkToken} [ancestor] Parent of top-most tokens.
 * @returns {MicromarkToken[]} Micromark tokens.
 */
function parseInternal(
  markdown,
  parseOptions = {},
  micromarkParseOptions = {},
  lineDelta = 0,
  ancestor = undefined
) {
  // Get options
  const freezeTokens = Boolean(parseOptions.freezeTokens);

  // Use micromark to parse document into Events
  const events = getEvents(markdown, micromarkParseOptions);

  // Create Token objects
  const document = [];
  let flatTokens = [];
  /** @type {MicromarkToken} */
  const root = {
    "type": "data",
    "startLine": -1,
    "startColumn": -1,
    "endLine": -1,
    "endColumn": -1,
    "text": "ROOT",
    "children": document,
    "parent": null
  };
  const history = [ root ];
  let current = root;
  /** @type {MicromarkParseOptions | null} */
  let reparseOptions = null;
  let lines = null;
  let skipHtmlFlowChildren = false;
  for (const event of events) {
    const [ kind, token, context ] = event;
    const { type, start, end } = token;
    const { "column": startColumn, "line": startLine } = start;
    const { "column": endColumn, "line": endLine } = end;
    const text = context.sliceSerialize(token);
    if ((kind === "enter") && !skipHtmlFlowChildren) {
      const previous = current;
      history.push(previous);
      current = {
        type,
        "startLine": startLine + lineDelta,
        startColumn,
        "endLine": endLine + lineDelta,
        endColumn,
        text,
        "children": [],
        "parent": ((previous === root) ? (ancestor || null) : previous)
      };
      if (ancestor) {
        Object.defineProperty(current, htmlFlowSymbol, { "value": true });
      }
      previous.children.push(current);
      flatTokens.push(current);
      if ((current.type === "htmlFlow") && !isHtmlFlowComment(current)) {
        skipHtmlFlowChildren = true;
        if (!reparseOptions || !lines) {
          reparseOptions = {
            ...micromarkParseOptions,
            "extensions": [
              {
                "disable": {
                  "null": [ "codeIndented", "htmlFlow" ]
                }
              }
            ]
          };
          lines = markdown.split(newLineRe);
        }
        const reparseMarkdown = lines
          .slice(current.startLine - 1, current.endLine)
          .join("\n");
        const tokens = parseInternal(
          reparseMarkdown,
          parseOptions,
          reparseOptions,
          current.startLine - 1,
          current
        );
        current.children = tokens;
        // Avoid stack overflow of Array.push(...spread)
        // eslint-disable-next-line unicorn/prefer-spread
        flatTokens = flatTokens.concat(tokens[flatTokensSymbol]);
      }
    } else if (kind === "exit") {
      if (type === "htmlFlow") {
        skipHtmlFlowChildren = false;
      }
      if (!skipHtmlFlowChildren) {
        if (freezeTokens) {
          Object.freeze(current.children);
          Object.freeze(current);
        }
        // @ts-ignore
        current = history.pop();
      }
    }
  }

  // Return document
  Object.defineProperty(document, flatTokensSymbol, { "value": flatTokens });
  if (freezeTokens) {
    Object.freeze(document);
  }
  return document;
}

/**
 * Parses a Markdown document and returns micromark tokens.
 *
 * @param {string} markdown Markdown document.
 * @param {ParseOptions} [parseOptions] Options.
 * @returns {MicromarkToken[]} Micromark tokens.
 */
export function parse(markdown, parseOptions) {
  return parseInternal(markdown, parseOptions);
}
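For orientation, a small usage sketch of the two exports defined above (not part of the commit; the relative import assumes a caller that sits next to lib/micromark-parse.mjs, as the modules in the earlier hunks do):

import { getEvents, parse } from "./micromark-parse.mjs";

const markdown = "# Title\n\nSee [spec][] for details.\n";

// getEvents returns flat micromark events ([ kind, token, context ] tuples),
// including the artificial undefinedReference* events appended by the
// labelEnd shim when a label has no matching definition.
for (const [ kind, token ] of getEvents(markdown)) {
  console.log(kind, token.type);
}

// parse returns the nested MicromarkToken tree built by parseInternal;
// freezeTokens makes the returned token objects immutable.
const tokens = parse(markdown, { "freezeTokens": true });
for (const token of tokens) {
  console.log(token.type, token.startLine, token.endLine, token.children.length);
}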
11  lib/micromark-types.d.mts  (Normal file)
@@ -0,0 +1,11 @@
export {};

// Augment TokenTypeMap with markdownlint-specific types.
declare module "micromark-util-types" {
  export interface TokenTypeMap {
    undefinedReference: "undefinedReference"
    undefinedReferenceCollapsed: "undefinedReferenceCollapsed"
    undefinedReferenceFull: "undefinedReferenceFull"
    undefinedReferenceShortcut: "undefinedReferenceShortcut"
  }
}
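The declaration file above only widens micromark's TokenTypeMap so that type-checked code can refer to the four artificial token types. A hedged sketch of how those types surface at runtime through getEvents (the example input and counting logic are illustrative, not from the commit):

import { getEvents } from "./micromark-parse.mjs";

// The four artificial types registered in micromark-types.d.mts.
const undefinedReferenceTypes = new Set([
  "undefinedReference",
  "undefinedReferenceCollapsed",
  "undefinedReferenceFull",
  "undefinedReferenceShortcut"
]);

// "[missing]" has no matching link definition, so the labelEnd shim should
// emit artificial enter/exit events for it.
const events = getEvents("A [missing] shortcut reference.\n");
const undefinedCount = events
  .filter(([ kind, token ]) => (kind === "enter") && undefinedReferenceTypes.has(token.type))
  .length;
console.log(undefinedCount);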