Refactor use of micromark so token stream is authentic by shimming undefined link reference handling, remove no-longer-necessary parse operation in MD034.

2025-09-22 05:40:48 +02:00 · 2024-10-21 20:56:22 -07:00 · 2024-10-21 20:56:22 -07:00 · 38b4ec0c2f
commit 38b4ec0c2f
parent 1d9bd4725d
6 changed files with 369 additions and 122 deletions
--- a/demo/markdownlint-browser.js
+++ b/demo/markdownlint-browser.js
@ -381,11 +381,23 @@ module.exports.frontMatterHasTitle =
 */
 function getReferenceLinkImageData(tokens) {
  const normalizeReference = (s) => s.toLowerCase().trim().replace(/\s+/g, " ");
+  const references = new Map();
+  const shortcuts = new Map();
+  const addReferenceToDictionary = (token, label, isShortcut) => {
+    const referenceDatum = [
+      token.startLine - 1,
+      token.startColumn - 1,
+      token.text.length
+    ];
+    const reference = normalizeReference(label);
+    const dictionary = isShortcut ? shortcuts : references;
+    const referenceData = dictionary.get(reference) || [];
+    referenceData.push(referenceDatum);
+    dictionary.set(reference, referenceData);
+  };
  const definitions = new Map();
  const definitionLineIndices = [];
  const duplicateDefinitions = [];
-  const references = new Map();
-  const shortcuts = new Map();
  const filteredTokens =
    micromark.filterByTypes(
      tokens,
@ -395,7 +407,9 @@ function getReferenceLinkImageData(tokens) {
        // definitions and definitionLineIndices
        "definitionLabelString", "gfmFootnoteDefinitionLabelString",
        // references and shortcuts
-        "gfmFootnoteCall", "image", "link"
+        "gfmFootnoteCall", "image", "link",
+        // undefined link labels
+        "undefinedReferenceCollapsed", "undefinedReferenceFull", "undefinedReferenceShortcut"
      ]
    );
  for (const token of filteredTokens) {
@ -451,22 +465,20 @@ function getReferenceLinkImageData(tokens) {
          }
          // Track link (handle shortcuts separately due to ambiguity in "text [text] text")
          if (isShortcut || isFullOrCollapsed) {
-            const referenceDatum = [
-              token.startLine - 1,
-              token.startColumn - 1,
-              token.text.length,
-              label.length,
-              (referenceString?.text || "").length
-            ];
-            const reference =
-              normalizeReference(referenceString?.text || label);
-            const dictionary = isShortcut ? shortcuts : references;
-            const referenceData = dictionary.get(reference) || [];
-            referenceData.push(referenceDatum);
-            dictionary.set(reference, referenceData);
+            addReferenceToDictionary(token, referenceString?.text || label, isShortcut);
          }
        }
        break;
+      case "undefinedReferenceCollapsed":
+      case "undefinedReferenceFull":
+      case "undefinedReferenceShortcut":
+        {
+          const undefinedReference = micromark.getDescendantsByType(token, [ "undefinedReference" ])[0];
+          const label = undefinedReference.children.map((t) => t.text).join("");
+          const isShortcut = (token.type === "undefinedReferenceShortcut");
+          addReferenceToDictionary(token, label, isShortcut);
+        }
+        break;
    }
  }
  return {
@ -985,8 +997,12 @@ const micromark = __webpack_require__(/*! markdownlint-micromark */ "markdownlin
 const { isHtmlFlowComment } = __webpack_require__(/*! ./micromark-helpers.cjs */ "../helpers/micromark-helpers.cjs");
 const { flatTokensSymbol, htmlFlowSymbol, newLineRe } = __webpack_require__(/*! ./shared.js */ "../helpers/shared.js");

+/** @typedef {import("markdownlint-micromark").Construct} Construct */
 /** @typedef {import("markdownlint-micromark").Event} Event */
 /** @typedef {import("markdownlint-micromark").ParseOptions} MicromarkParseOptions */
+/** @typedef {import("markdownlint-micromark").State} State */
+/** @typedef {import("markdownlint-micromark").Token} Token */
+/** @typedef {import("markdownlint-micromark").Tokenizer} Tokenizer */
 /** @typedef {import("../lib/markdownlint.js").MicromarkToken} MicromarkToken */

 /**
@ -994,25 +1010,19 @@ const { flatTokensSymbol, htmlFlowSymbol, newLineRe } = __webpack_require__(/*!
 *
 * @typedef {Object} ParseOptions
 * @property {boolean} [freezeTokens] Whether to freeze output Tokens.
- * @property {boolean} [shimReferences] Whether to shim missing references.
 */

 /**
 * Parses a Markdown document and returns Micromark events.
 *
 * @param {string} markdown Markdown document.
- * @param {ParseOptions} [parseOptions] Options.
 * @param {MicromarkParseOptions} [micromarkParseOptions] Options for micromark.
 * @returns {Event[]} Micromark events.
 */
 function getEvents(
  markdown,
-  parseOptions = {},
  micromarkParseOptions = {}
 ) {
-  // Get options
-  const shimReferences = Boolean(parseOptions.shimReferences);
-
  // Customize options object to add useful extensions
  micromarkParseOptions.extensions = micromarkParseOptions.extensions || [];
  micromarkParseOptions.extensions.push(
@ -1023,17 +1033,137 @@ function getEvents(
    micromark.math()
  );

-  // Use micromark to parse document into Events
-  const encoding = undefined;
-  const eol = true;
-  const parseContext = micromark.parse(micromarkParseOptions);
-  if (shimReferences) {
-    // Customize ParseContext to treat all references as defined
-    parseContext.defined.includes = (searchElement) => searchElement.length > 0;
+  // // Shim labelEnd to identify undefined link labels
+  /** @type {Event[][]} */
+  const artificialEventLists = [];
+  /** @type {Construct} */
+  const labelEnd =
+    // @ts-ignore
+    micromark.labelEnd;
+  const tokenizeOriginal = labelEnd.tokenize;
+
+  /** @type {Tokenizer} */
+  function tokenizeShim(effects, okOriginal, nokOriginal) {
+    // eslint-disable-next-line consistent-this, unicorn/no-this-assignment, no-invalid-this
+    const tokenizeContext = this;
+    const events = tokenizeContext.events;
+
+    /** @type {State} */
+    const nokShim = (code) => {
+      // Find start of label (image or link)
+      let indexStart = events.length;
+      while (--indexStart >= 0) {
+        const event = events[indexStart];
+        const [ kind, token ] = event;
+        if (kind === "enter") {
+          const { type } = token;
+          if ((type === "labelImage") || (type === "labelLink")) {
+            // Found it
+            break;
+          }
+        }
+      }
+
+      // If found...
+      if (indexStart >= 0) {
+        // Create artificial enter/exit events and replicate all data/lineEnding events within
+        const eventStart = events[indexStart];
+        const [ , eventStartToken ] = eventStart;
+        const eventEnd = events[events.length - 1];
+        const [ , eventEndToken ] = eventEnd;
+        /** @type {Token} */
+        const undefinedReferenceType = {
+          "type": "undefinedReferenceShortcut",
+          "start": eventStartToken.start,
+          "end": eventEndToken.end
+        };
+        /** @type {Token} */
+        const undefinedReference = {
+          "type": "undefinedReference",
+          "start": eventStartToken.start,
+          "end": eventEndToken.end
+        };
+        const eventsToReplicate = events
+          .slice(indexStart)
+          .filter((event) => {
+            const [ , eventToken ] = event;
+            const { type } = eventToken;
+            return (type === "data") || (type === "lineEnding");
+          });
+
+        // Determine the type of the undefined reference
+        const previousUndefinedEvent = (artificialEventLists.length > 0) && artificialEventLists[artificialEventLists.length - 1][0];
+        const previousUndefinedToken = previousUndefinedEvent && previousUndefinedEvent[1];
+        if (
+          previousUndefinedToken &&
+          (previousUndefinedToken.end.line === undefinedReferenceType.start.line) &&
+          (previousUndefinedToken.end.column === undefinedReferenceType.start.column)
+        ) {
+          // Previous undefined reference event is immediately before this one
+          if (eventsToReplicate.length === 0) {
+            // The pair represent a collapsed reference (ex: [...][])
+            previousUndefinedToken.type = "undefinedReferenceCollapsed";
+            previousUndefinedToken.end = eventEndToken.end;
+          } else {
+            // The pair represent a full reference (ex: [...][...])
+            undefinedReferenceType.type = "undefinedReferenceFull";
+            undefinedReferenceType.start = previousUndefinedToken.start;
+            artificialEventLists.pop();
+          }
+        }
+
+        // Create artificial event list and replicate content
+        const text = eventsToReplicate
+          .filter((event) => event[0] === "enter")
+          .map((event) => tokenizeContext.sliceSerialize(event[1]))
+          .join("")
+          .trim();
+        if ((text.length > 0) && !text.includes("]")) {
+          /** @type {Event[]} */
+          const artificialEvents = [];
+          artificialEvents.push(
+            [ "enter", undefinedReferenceType, tokenizeContext ],
+            [ "enter", undefinedReference, tokenizeContext ]
+          );
+          for (const event of eventsToReplicate) {
+            const [ kind, token ] = event;
+            // Copy token because the current object will get modified by the parser
+            artificialEvents.push([ kind, { ...token }, tokenizeContext ]);
+          }
+          artificialEvents.push(
+            [ "exit", undefinedReference, tokenizeContext ],
+            [ "exit", undefinedReferenceType, tokenizeContext ]
+          );
+          artificialEventLists.push(artificialEvents);
+        }
+      }
+
+      // Continue with original behavior
+      return nokOriginal(code);
+    };
+
+    // Shim nok handler of labelEnd's tokenize
+    return tokenizeOriginal.call(tokenizeContext, effects, okOriginal, nokShim);
+  }
+
+  try {
+    // Shim labelEnd behavior to detect undefined references
+    labelEnd.tokenize = tokenizeShim;
+
+    // Use micromark to parse document into Events
+    const encoding = undefined;
+    const eol = true;
+    const parseContext = micromark.parse(micromarkParseOptions);
+    const chunks = micromark.preprocess()(markdown, encoding, eol);
+    const events = micromark.postprocess(parseContext.document().write(chunks));
+
+    // Append artificial events and return all events
+    // eslint-disable-next-line unicorn/prefer-spread
+    return events.concat(...artificialEventLists);
+  } finally {
+    // Restore shimmed labelEnd behavior
+    labelEnd.tokenize = tokenizeOriginal;
  }
-  const chunks = micromark.preprocess()(markdown, encoding, eol);
-  const events = micromark.postprocess(parseContext.document().write(chunks));
-  return events;
 }

 /**
@ -1057,7 +1187,7 @@ function parseInternal(
  const freezeTokens = Boolean(parseOptions.freezeTokens);

  // Use micromark to parse document into Events
-  const events = getEvents(markdown, parseOptions, micromarkParseOptions);
+  const events = getEvents(markdown, micromarkParseOptions);

  // Create Token objects
  const document = [];
@ -1974,7 +2104,7 @@ function lintContent(
  // Parse content into parser tokens
  const micromarkTokens = micromark.parse(
    content,
-    { "freezeTokens": customRulesPresent, "shimReferences": true }
+    { "freezeTokens": customRulesPresent }
  );
  // Hide the content of HTML comments from rules
  const preClearedContent = content;
@ -5019,8 +5149,6 @@ module.exports = {

 const { addErrorContext } = __webpack_require__(/*! ../helpers */ "../helpers/helpers.js");
 const { filterByPredicate, getHtmlTagInfo, inHtmlFlow } = __webpack_require__(/*! ../helpers/micromark-helpers.cjs */ "../helpers/micromark-helpers.cjs");
-const { parse } = __webpack_require__(/*! ../helpers/micromark-parse.cjs */ "../helpers/micromark-parse.cjs");
-const { filterByTypesCached } = __webpack_require__(/*! ./cache */ "../lib/cache.js");

 // eslint-disable-next-line jsdoc/valid-types
 /** @type import("./markdownlint").Rule */
@ -5054,6 +5182,7 @@ module.exports = {
          return false;
        },
        (token) => {
+          // Ignore content of inline HTML tags
          const { children } = token;
          const result = [];
          for (let i = 0; i < children.length; i++) {
@ -5084,31 +5213,25 @@ module.exports = {
        }
      )
    );
-    const autoLinks = filterByTypesCached([ "literalAutolink" ]);
-    if (autoLinks.length > 0) {
-      // Re-parse with correct link/image reference definition handling
-      const document = params.lines.join("\n");
-      const tokens = parse(document);
-      for (const token of literalAutolinks(tokens)) {
-        const range = [
-          token.startColumn,
-          token.endColumn - token.startColumn
-        ];
-        const fixInfo = {
-          "editColumn": range[0],
-          "deleteCount": range[1],
-          "insertText": `<${token.text}>`
-        };
-        addErrorContext(
-          onError,
-          token.startLine,
-          token.text,
-          undefined,
-          undefined,
-          range,
-          fixInfo
-        );
-      }
+    for (const token of literalAutolinks(params.parsers.micromark.tokens)) {
+      const range = [
+        token.startColumn,
+        token.endColumn - token.startColumn
+      ];
+      const fixInfo = {
+        "editColumn": range[0],
+        "deleteCount": range[1],
+        "insertText": `<${token.text}>`
+      };
+      addErrorContext(
+        onError,
+        token.startLine,
+        token.text,
+        undefined,
+        undefined,
+        range,
+        fixInfo
+      );
    }
  }
 };