Refactor use of micromark so token stream is authentic by shimming undefined link reference handling, remove no-longer-necessary parse operation in MD034.

This commit is contained in:
David Anson 2024-10-21 20:56:22 -07:00
parent 1d9bd4725d
commit 38b4ec0c2f
6 changed files with 369 additions and 122 deletions

View file

@ -381,11 +381,23 @@ module.exports.frontMatterHasTitle =
*/
function getReferenceLinkImageData(tokens) {
const normalizeReference = (s) => s.toLowerCase().trim().replace(/\s+/g, " ");
const references = new Map();
const shortcuts = new Map();
const addReferenceToDictionary = (token, label, isShortcut) => {
const referenceDatum = [
token.startLine - 1,
token.startColumn - 1,
token.text.length
];
const reference = normalizeReference(label);
const dictionary = isShortcut ? shortcuts : references;
const referenceData = dictionary.get(reference) || [];
referenceData.push(referenceDatum);
dictionary.set(reference, referenceData);
};
const definitions = new Map();
const definitionLineIndices = [];
const duplicateDefinitions = [];
const references = new Map();
const shortcuts = new Map();
const filteredTokens =
micromark.filterByTypes(
tokens,
@ -395,7 +407,9 @@ function getReferenceLinkImageData(tokens) {
// definitions and definitionLineIndices
"definitionLabelString", "gfmFootnoteDefinitionLabelString",
// references and shortcuts
"gfmFootnoteCall", "image", "link"
"gfmFootnoteCall", "image", "link",
// undefined link labels
"undefinedReferenceCollapsed", "undefinedReferenceFull", "undefinedReferenceShortcut"
]
);
for (const token of filteredTokens) {
@ -451,22 +465,20 @@ function getReferenceLinkImageData(tokens) {
}
// Track link (handle shortcuts separately due to ambiguity in "text [text] text")
if (isShortcut || isFullOrCollapsed) {
const referenceDatum = [
token.startLine - 1,
token.startColumn - 1,
token.text.length,
label.length,
(referenceString?.text || "").length
];
const reference =
normalizeReference(referenceString?.text || label);
const dictionary = isShortcut ? shortcuts : references;
const referenceData = dictionary.get(reference) || [];
referenceData.push(referenceDatum);
dictionary.set(reference, referenceData);
addReferenceToDictionary(token, referenceString?.text || label, isShortcut);
}
}
break;
case "undefinedReferenceCollapsed":
case "undefinedReferenceFull":
case "undefinedReferenceShortcut":
{
const undefinedReference = micromark.getDescendantsByType(token, [ "undefinedReference" ])[0];
const label = undefinedReference.children.map((t) => t.text).join("");
const isShortcut = (token.type === "undefinedReferenceShortcut");
addReferenceToDictionary(token, label, isShortcut);
}
break;
}
}
return {
@ -985,8 +997,12 @@ const micromark = __webpack_require__(/*! markdownlint-micromark */ "markdownlin
const { isHtmlFlowComment } = __webpack_require__(/*! ./micromark-helpers.cjs */ "../helpers/micromark-helpers.cjs");
const { flatTokensSymbol, htmlFlowSymbol, newLineRe } = __webpack_require__(/*! ./shared.js */ "../helpers/shared.js");
/** @typedef {import("markdownlint-micromark").Construct} Construct */
/** @typedef {import("markdownlint-micromark").Event} Event */
/** @typedef {import("markdownlint-micromark").ParseOptions} MicromarkParseOptions */
/** @typedef {import("markdownlint-micromark").State} State */
/** @typedef {import("markdownlint-micromark").Token} Token */
/** @typedef {import("markdownlint-micromark").Tokenizer} Tokenizer */
/** @typedef {import("../lib/markdownlint.js").MicromarkToken} MicromarkToken */
/**
@ -994,25 +1010,19 @@ const { flatTokensSymbol, htmlFlowSymbol, newLineRe } = __webpack_require__(/*!
*
* @typedef {Object} ParseOptions
* @property {boolean} [freezeTokens] Whether to freeze output Tokens.
* @property {boolean} [shimReferences] Whether to shim missing references.
*/
/**
* Parses a Markdown document and returns Micromark events.
*
* @param {string} markdown Markdown document.
* @param {ParseOptions} [parseOptions] Options.
* @param {MicromarkParseOptions} [micromarkParseOptions] Options for micromark.
* @returns {Event[]} Micromark events.
*/
function getEvents(
markdown,
parseOptions = {},
micromarkParseOptions = {}
) {
// Get options
const shimReferences = Boolean(parseOptions.shimReferences);
// Customize options object to add useful extensions
micromarkParseOptions.extensions = micromarkParseOptions.extensions || [];
micromarkParseOptions.extensions.push(
@ -1023,17 +1033,137 @@ function getEvents(
micromark.math()
);
// Use micromark to parse document into Events
const encoding = undefined;
const eol = true;
const parseContext = micromark.parse(micromarkParseOptions);
if (shimReferences) {
// Customize ParseContext to treat all references as defined
parseContext.defined.includes = (searchElement) => searchElement.length > 0;
// // Shim labelEnd to identify undefined link labels
/** @type {Event[][]} */
const artificialEventLists = [];
/** @type {Construct} */
const labelEnd =
// @ts-ignore
micromark.labelEnd;
const tokenizeOriginal = labelEnd.tokenize;
/** @type {Tokenizer} */
function tokenizeShim(effects, okOriginal, nokOriginal) {
// eslint-disable-next-line consistent-this, unicorn/no-this-assignment, no-invalid-this
const tokenizeContext = this;
const events = tokenizeContext.events;
/** @type {State} */
const nokShim = (code) => {
// Find start of label (image or link)
let indexStart = events.length;
while (--indexStart >= 0) {
const event = events[indexStart];
const [ kind, token ] = event;
if (kind === "enter") {
const { type } = token;
if ((type === "labelImage") || (type === "labelLink")) {
// Found it
break;
}
}
}
// If found...
if (indexStart >= 0) {
// Create artificial enter/exit events and replicate all data/lineEnding events within
const eventStart = events[indexStart];
const [ , eventStartToken ] = eventStart;
const eventEnd = events[events.length - 1];
const [ , eventEndToken ] = eventEnd;
/** @type {Token} */
const undefinedReferenceType = {
"type": "undefinedReferenceShortcut",
"start": eventStartToken.start,
"end": eventEndToken.end
};
/** @type {Token} */
const undefinedReference = {
"type": "undefinedReference",
"start": eventStartToken.start,
"end": eventEndToken.end
};
const eventsToReplicate = events
.slice(indexStart)
.filter((event) => {
const [ , eventToken ] = event;
const { type } = eventToken;
return (type === "data") || (type === "lineEnding");
});
// Determine the type of the undefined reference
const previousUndefinedEvent = (artificialEventLists.length > 0) && artificialEventLists[artificialEventLists.length - 1][0];
const previousUndefinedToken = previousUndefinedEvent && previousUndefinedEvent[1];
if (
previousUndefinedToken &&
(previousUndefinedToken.end.line === undefinedReferenceType.start.line) &&
(previousUndefinedToken.end.column === undefinedReferenceType.start.column)
) {
// Previous undefined reference event is immediately before this one
if (eventsToReplicate.length === 0) {
// The pair represent a collapsed reference (ex: [...][])
previousUndefinedToken.type = "undefinedReferenceCollapsed";
previousUndefinedToken.end = eventEndToken.end;
} else {
// The pair represent a full reference (ex: [...][...])
undefinedReferenceType.type = "undefinedReferenceFull";
undefinedReferenceType.start = previousUndefinedToken.start;
artificialEventLists.pop();
}
}
// Create artificial event list and replicate content
const text = eventsToReplicate
.filter((event) => event[0] === "enter")
.map((event) => tokenizeContext.sliceSerialize(event[1]))
.join("")
.trim();
if ((text.length > 0) && !text.includes("]")) {
/** @type {Event[]} */
const artificialEvents = [];
artificialEvents.push(
[ "enter", undefinedReferenceType, tokenizeContext ],
[ "enter", undefinedReference, tokenizeContext ]
);
for (const event of eventsToReplicate) {
const [ kind, token ] = event;
// Copy token because the current object will get modified by the parser
artificialEvents.push([ kind, { ...token }, tokenizeContext ]);
}
artificialEvents.push(
[ "exit", undefinedReference, tokenizeContext ],
[ "exit", undefinedReferenceType, tokenizeContext ]
);
artificialEventLists.push(artificialEvents);
}
}
// Continue with original behavior
return nokOriginal(code);
};
// Shim nok handler of labelEnd's tokenize
return tokenizeOriginal.call(tokenizeContext, effects, okOriginal, nokShim);
}
try {
// Shim labelEnd behavior to detect undefined references
labelEnd.tokenize = tokenizeShim;
// Use micromark to parse document into Events
const encoding = undefined;
const eol = true;
const parseContext = micromark.parse(micromarkParseOptions);
const chunks = micromark.preprocess()(markdown, encoding, eol);
const events = micromark.postprocess(parseContext.document().write(chunks));
// Append artificial events and return all events
// eslint-disable-next-line unicorn/prefer-spread
return events.concat(...artificialEventLists);
} finally {
// Restore shimmed labelEnd behavior
labelEnd.tokenize = tokenizeOriginal;
}
const chunks = micromark.preprocess()(markdown, encoding, eol);
const events = micromark.postprocess(parseContext.document().write(chunks));
return events;
}
/**
@ -1057,7 +1187,7 @@ function parseInternal(
const freezeTokens = Boolean(parseOptions.freezeTokens);
// Use micromark to parse document into Events
const events = getEvents(markdown, parseOptions, micromarkParseOptions);
const events = getEvents(markdown, micromarkParseOptions);
// Create Token objects
const document = [];
@ -1974,7 +2104,7 @@ function lintContent(
// Parse content into parser tokens
const micromarkTokens = micromark.parse(
content,
{ "freezeTokens": customRulesPresent, "shimReferences": true }
{ "freezeTokens": customRulesPresent }
);
// Hide the content of HTML comments from rules
const preClearedContent = content;
@ -5019,8 +5149,6 @@ module.exports = {
const { addErrorContext } = __webpack_require__(/*! ../helpers */ "../helpers/helpers.js");
const { filterByPredicate, getHtmlTagInfo, inHtmlFlow } = __webpack_require__(/*! ../helpers/micromark-helpers.cjs */ "../helpers/micromark-helpers.cjs");
const { parse } = __webpack_require__(/*! ../helpers/micromark-parse.cjs */ "../helpers/micromark-parse.cjs");
const { filterByTypesCached } = __webpack_require__(/*! ./cache */ "../lib/cache.js");
// eslint-disable-next-line jsdoc/valid-types
/** @type import("./markdownlint").Rule */
@ -5054,6 +5182,7 @@ module.exports = {
return false;
},
(token) => {
// Ignore content of inline HTML tags
const { children } = token;
const result = [];
for (let i = 0; i < children.length; i++) {
@ -5084,31 +5213,25 @@ module.exports = {
}
)
);
const autoLinks = filterByTypesCached([ "literalAutolink" ]);
if (autoLinks.length > 0) {
// Re-parse with correct link/image reference definition handling
const document = params.lines.join("\n");
const tokens = parse(document);
for (const token of literalAutolinks(tokens)) {
const range = [
token.startColumn,
token.endColumn - token.startColumn
];
const fixInfo = {
"editColumn": range[0],
"deleteCount": range[1],
"insertText": `<${token.text}>`
};
addErrorContext(
onError,
token.startLine,
token.text,
undefined,
undefined,
range,
fixInfo
);
}
for (const token of literalAutolinks(params.parsers.micromark.tokens)) {
const range = [
token.startColumn,
token.endColumn - token.startColumn
];
const fixInfo = {
"editColumn": range[0],
"deleteCount": range[1],
"insertText": `<${token.text}>`
};
addErrorContext(
onError,
token.startLine,
token.text,
undefined,
undefined,
range,
fixInfo
);
}
}
};

View file

@ -369,11 +369,23 @@ module.exports.frontMatterHasTitle =
*/
function getReferenceLinkImageData(tokens) {
const normalizeReference = (s) => s.toLowerCase().trim().replace(/\s+/g, " ");
const references = new Map();
const shortcuts = new Map();
const addReferenceToDictionary = (token, label, isShortcut) => {
const referenceDatum = [
token.startLine - 1,
token.startColumn - 1,
token.text.length
];
const reference = normalizeReference(label);
const dictionary = isShortcut ? shortcuts : references;
const referenceData = dictionary.get(reference) || [];
referenceData.push(referenceDatum);
dictionary.set(reference, referenceData);
};
const definitions = new Map();
const definitionLineIndices = [];
const duplicateDefinitions = [];
const references = new Map();
const shortcuts = new Map();
const filteredTokens =
micromark.filterByTypes(
tokens,
@ -383,7 +395,9 @@ function getReferenceLinkImageData(tokens) {
// definitions and definitionLineIndices
"definitionLabelString", "gfmFootnoteDefinitionLabelString",
// references and shortcuts
"gfmFootnoteCall", "image", "link"
"gfmFootnoteCall", "image", "link",
// undefined link labels
"undefinedReferenceCollapsed", "undefinedReferenceFull", "undefinedReferenceShortcut"
]
);
for (const token of filteredTokens) {
@ -439,22 +453,20 @@ function getReferenceLinkImageData(tokens) {
}
// Track link (handle shortcuts separately due to ambiguity in "text [text] text")
if (isShortcut || isFullOrCollapsed) {
const referenceDatum = [
token.startLine - 1,
token.startColumn - 1,
token.text.length,
label.length,
(referenceString?.text || "").length
];
const reference =
normalizeReference(referenceString?.text || label);
const dictionary = isShortcut ? shortcuts : references;
const referenceData = dictionary.get(reference) || [];
referenceData.push(referenceDatum);
dictionary.set(reference, referenceData);
addReferenceToDictionary(token, referenceString?.text || label, isShortcut);
}
}
break;
case "undefinedReferenceCollapsed":
case "undefinedReferenceFull":
case "undefinedReferenceShortcut":
{
const undefinedReference = micromark.getDescendantsByType(token, [ "undefinedReference" ])[0];
const label = undefinedReference.children.map((t) => t.text).join("");
const isShortcut = (token.type === "undefinedReferenceShortcut");
addReferenceToDictionary(token, label, isShortcut);
}
break;
}
}
return {

View file

@ -6,8 +6,12 @@ const micromark = require("markdownlint-micromark");
const { isHtmlFlowComment } = require("./micromark-helpers.cjs");
const { flatTokensSymbol, htmlFlowSymbol, newLineRe } = require("./shared.js");
/** @typedef {import("markdownlint-micromark").Construct} Construct */
/** @typedef {import("markdownlint-micromark").Event} Event */
/** @typedef {import("markdownlint-micromark").ParseOptions} MicromarkParseOptions */
/** @typedef {import("markdownlint-micromark").State} State */
/** @typedef {import("markdownlint-micromark").Token} Token */
/** @typedef {import("markdownlint-micromark").Tokenizer} Tokenizer */
/** @typedef {import("../lib/markdownlint.js").MicromarkToken} MicromarkToken */
/**
@ -15,25 +19,19 @@ const { flatTokensSymbol, htmlFlowSymbol, newLineRe } = require("./shared.js");
*
* @typedef {Object} ParseOptions
* @property {boolean} [freezeTokens] Whether to freeze output Tokens.
* @property {boolean} [shimReferences] Whether to shim missing references.
*/
/**
* Parses a Markdown document and returns Micromark events.
*
* @param {string} markdown Markdown document.
* @param {ParseOptions} [parseOptions] Options.
* @param {MicromarkParseOptions} [micromarkParseOptions] Options for micromark.
* @returns {Event[]} Micromark events.
*/
function getEvents(
markdown,
parseOptions = {},
micromarkParseOptions = {}
) {
// Get options
const shimReferences = Boolean(parseOptions.shimReferences);
// Customize options object to add useful extensions
micromarkParseOptions.extensions = micromarkParseOptions.extensions || [];
micromarkParseOptions.extensions.push(
@ -44,17 +42,137 @@ function getEvents(
micromark.math()
);
// Use micromark to parse document into Events
const encoding = undefined;
const eol = true;
const parseContext = micromark.parse(micromarkParseOptions);
if (shimReferences) {
// Customize ParseContext to treat all references as defined
parseContext.defined.includes = (searchElement) => searchElement.length > 0;
// // Shim labelEnd to identify undefined link labels
/** @type {Event[][]} */
const artificialEventLists = [];
/** @type {Construct} */
const labelEnd =
// @ts-ignore
micromark.labelEnd;
const tokenizeOriginal = labelEnd.tokenize;
/** @type {Tokenizer} */
function tokenizeShim(effects, okOriginal, nokOriginal) {
// eslint-disable-next-line consistent-this, unicorn/no-this-assignment, no-invalid-this
const tokenizeContext = this;
const events = tokenizeContext.events;
/** @type {State} */
const nokShim = (code) => {
// Find start of label (image or link)
let indexStart = events.length;
while (--indexStart >= 0) {
const event = events[indexStart];
const [ kind, token ] = event;
if (kind === "enter") {
const { type } = token;
if ((type === "labelImage") || (type === "labelLink")) {
// Found it
break;
}
}
}
// If found...
if (indexStart >= 0) {
// Create artificial enter/exit events and replicate all data/lineEnding events within
const eventStart = events[indexStart];
const [ , eventStartToken ] = eventStart;
const eventEnd = events[events.length - 1];
const [ , eventEndToken ] = eventEnd;
/** @type {Token} */
const undefinedReferenceType = {
"type": "undefinedReferenceShortcut",
"start": eventStartToken.start,
"end": eventEndToken.end
};
/** @type {Token} */
const undefinedReference = {
"type": "undefinedReference",
"start": eventStartToken.start,
"end": eventEndToken.end
};
const eventsToReplicate = events
.slice(indexStart)
.filter((event) => {
const [ , eventToken ] = event;
const { type } = eventToken;
return (type === "data") || (type === "lineEnding");
});
// Determine the type of the undefined reference
const previousUndefinedEvent = (artificialEventLists.length > 0) && artificialEventLists[artificialEventLists.length - 1][0];
const previousUndefinedToken = previousUndefinedEvent && previousUndefinedEvent[1];
if (
previousUndefinedToken &&
(previousUndefinedToken.end.line === undefinedReferenceType.start.line) &&
(previousUndefinedToken.end.column === undefinedReferenceType.start.column)
) {
// Previous undefined reference event is immediately before this one
if (eventsToReplicate.length === 0) {
// The pair represent a collapsed reference (ex: [...][])
previousUndefinedToken.type = "undefinedReferenceCollapsed";
previousUndefinedToken.end = eventEndToken.end;
} else {
// The pair represent a full reference (ex: [...][...])
undefinedReferenceType.type = "undefinedReferenceFull";
undefinedReferenceType.start = previousUndefinedToken.start;
artificialEventLists.pop();
}
}
// Create artificial event list and replicate content
const text = eventsToReplicate
.filter((event) => event[0] === "enter")
.map((event) => tokenizeContext.sliceSerialize(event[1]))
.join("")
.trim();
if ((text.length > 0) && !text.includes("]")) {
/** @type {Event[]} */
const artificialEvents = [];
artificialEvents.push(
[ "enter", undefinedReferenceType, tokenizeContext ],
[ "enter", undefinedReference, tokenizeContext ]
);
for (const event of eventsToReplicate) {
const [ kind, token ] = event;
// Copy token because the current object will get modified by the parser
artificialEvents.push([ kind, { ...token }, tokenizeContext ]);
}
artificialEvents.push(
[ "exit", undefinedReference, tokenizeContext ],
[ "exit", undefinedReferenceType, tokenizeContext ]
);
artificialEventLists.push(artificialEvents);
}
}
// Continue with original behavior
return nokOriginal(code);
};
// Shim nok handler of labelEnd's tokenize
return tokenizeOriginal.call(tokenizeContext, effects, okOriginal, nokShim);
}
try {
// Shim labelEnd behavior to detect undefined references
labelEnd.tokenize = tokenizeShim;
// Use micromark to parse document into Events
const encoding = undefined;
const eol = true;
const parseContext = micromark.parse(micromarkParseOptions);
const chunks = micromark.preprocess()(markdown, encoding, eol);
const events = micromark.postprocess(parseContext.document().write(chunks));
// Append artificial events and return all events
// eslint-disable-next-line unicorn/prefer-spread
return events.concat(...artificialEventLists);
} finally {
// Restore shimmed labelEnd behavior
labelEnd.tokenize = tokenizeOriginal;
}
const chunks = micromark.preprocess()(markdown, encoding, eol);
const events = micromark.postprocess(parseContext.document().write(chunks));
return events;
}
/**
@ -78,7 +196,7 @@ function parseInternal(
const freezeTokens = Boolean(parseOptions.freezeTokens);
// Use micromark to parse document into Events
const events = getEvents(markdown, parseOptions, micromarkParseOptions);
const events = getEvents(markdown, micromarkParseOptions);
// Create Token objects
const document = [];

View file

@ -499,7 +499,7 @@ function lintContent(
// Parse content into parser tokens
const micromarkTokens = micromark.parse(
content,
{ "freezeTokens": customRulesPresent, "shimReferences": true }
{ "freezeTokens": customRulesPresent }
);
// Hide the content of HTML comments from rules
const preClearedContent = content;

View file

@ -4,8 +4,6 @@
const { addErrorContext } = require("../helpers");
const { filterByPredicate, getHtmlTagInfo, inHtmlFlow } = require("../helpers/micromark-helpers.cjs");
const { parse } = require("../helpers/micromark-parse.cjs");
const { filterByTypesCached } = require("./cache");
// eslint-disable-next-line jsdoc/valid-types
/** @type import("./markdownlint").Rule */
@ -39,6 +37,7 @@ module.exports = {
return false;
},
(token) => {
// Ignore content of inline HTML tags
const { children } = token;
const result = [];
for (let i = 0; i < children.length; i++) {
@ -69,31 +68,25 @@ module.exports = {
}
)
);
const autoLinks = filterByTypesCached([ "literalAutolink" ]);
if (autoLinks.length > 0) {
// Re-parse with correct link/image reference definition handling
const document = params.lines.join("\n");
const tokens = parse(document);
for (const token of literalAutolinks(tokens)) {
const range = [
token.startColumn,
token.endColumn - token.startColumn
];
const fixInfo = {
"editColumn": range[0],
"deleteCount": range[1],
"insertText": `<${token.text}>`
};
addErrorContext(
onError,
token.startLine,
token.text,
undefined,
undefined,
range,
fixInfo
);
}
for (const token of literalAutolinks(params.parsers.micromark.tokens)) {
const range = [
token.startColumn,
token.endColumn - token.startColumn
];
const fixInfo = {
"editColumn": range[0],
"deleteCount": range[1],
"insertText": `<${token.text}>`
};
addErrorContext(
onError,
token.startLine,
token.text,
undefined,
undefined,
range,
fixInfo
);
}
}
};

View file

@ -410,6 +410,7 @@ Collapsed reference link: [label][]
Nested empty brackets: [text1[]](https://example.com/)
Missing close bracket, empty text: [text2[](https://example.com/)
Empty bracket pairs: [text3[]][]
Empty bracket pair: [text4[]]
[label]: https://example.com/label
`