import * as assert from "#universal/assert";
import { CJK_REGEXP, PUNCTUATION_REGEXP } from "./constants.evaluate.js";
import { locEnd, locStart } from "./loc.js";

/** Node types listed here are treated as inline nodes. */
const INLINE_NODE_TYPES = new Set([
  "liquidNode",
  "inlineCode",
  "emphasis",
  "esComment",
  "strong",
  "delete",
  "wikiLink",
  "link",
  "linkReference",
  "image",
  "imageReference",
  "footnote",
  "footnoteReference",
  "sentence",
  "whitespace",
  "word",
  "break",
  "inlineMath",
]);

/** Every inline node type plus the block-level types listed below. */
const INLINE_NODE_WRAPPER_TYPES = new Set([
  ...INLINE_NODE_TYPES,
  "tableCell",
  "paragraph",
  "heading",
]);

const KIND_NON_CJK = "non-cjk";
const KIND_CJ_LETTER = "cj-letter";
const KIND_K_LETTER = "k-letter";
const KIND_CJK_PUNCTUATION = "cjk-punctuation";

const K_REGEXP = /\p{Script_Extensions=Hangul}/u;

/**
 * Matches the marker at the start of an ordered list item: optional leading
 * whitespace, the digits (`numberText`), a `.` or `)` delimiter, and the
 * whitespace that follows the delimiter (`leadingSpaces`).
 */
const ORDERED_LIST_ITEM_REGEXP =
  /^\s*(?<numberText>\d+)(\.|\))(?<leadingSpaces>\s*)/u;

/**
 * @typedef {" " | "\n" | ""} WhitespaceValue
 * @typedef {
 *   | typeof KIND_NON_CJK
 *   | typeof KIND_CJ_LETTER
 *   | typeof KIND_K_LETTER
 *   | typeof KIND_CJK_PUNCTUATION
 * } WordKind
 * @typedef {{
 *   type: "whitespace",
 *   value: WhitespaceValue,
 *   kind?: never
 * }} WhitespaceNode
 * @typedef {{
 *   type: "word",
 *   value: string,
 *   kind: WordKind,
 *   isCJ: boolean,
 *   hasLeadingPunctuation: boolean,
 *   hasTrailingPunctuation: boolean,
 * }} WordNode Node for a single CJK character or a sequence of non-CJK characters
 * @typedef {WhitespaceNode | WordNode} TextNode
 */

/**
 * Split text into whitespace nodes and word nodes.
 *
 * Runs of tabs, newlines and spaces become one whitespace node whose value is
 * "\n" when the run contains a newline and " " otherwise. The text between
 * those runs is split again on `CJK_REGEXP`, so each CJK match becomes its own
 * word node while the non-CJK stretches stay together. Adjacent word nodes are
 * separated by an empty-valued whitespace node (see `appendNode` for the
 * exceptions).
 *
 * @param {string} text
 * @returns {Array<TextNode>}
 */
function splitText(text) {
  /** @type {Array<TextNode>} */
  const nodes = [];

  // The capturing group keeps the separators in the result, so whitespace runs
  // always sit at odd indexes.
  const tokens = text.split(/([\t\n ]+)/u);

  // Loop-invariant, so build it once per call instead of once per token.
  // The capturing group keeps the CJK matches at odd indexes of the split.
  const cjkSplitRegexp = new RegExp(`(${CJK_REGEXP.source})`, "u");

  for (const [index, token] of tokens.entries()) {
    // whitespace
    if (index % 2 === 1) {
      nodes.push({
        type: "whitespace",
        value: /\n/u.test(token) ? "\n" : " ",
      });
      continue;
    }

    // word separated by whitespace

    // `split` yields an empty first/last token when the text starts/ends with
    // whitespace; those carry no content.
    if ((index === 0 || index === tokens.length - 1) && token === "") {
      continue;
    }

    const innerTokens = token.split(cjkSplitRegexp);
    for (const [innerIndex, innerToken] of innerTokens.entries()) {
      if (
        (innerIndex === 0 || innerIndex === innerTokens.length - 1) &&
        innerToken === ""
      ) {
        continue;
      }

      // non-CJK word
      if (innerIndex % 2 === 0) {
        // An empty token here means two CJK matches were adjacent.
        if (innerToken !== "") {
          appendNode({
            type: "word",
            value: innerToken,
            kind: KIND_NON_CJK,
            isCJ: false,
            hasLeadingPunctuation: PUNCTUATION_REGEXP.test(innerToken[0]),
            hasTrailingPunctuation: PUNCTUATION_REGEXP.test(innerToken.at(-1)),
          });
        }
        continue;
      }

      // CJK character

      // punctuation for CJ(K)
      // Korean doesn't use them in horizontal writing usually
      if (PUNCTUATION_REGEXP.test(innerToken)) {
        appendNode({
          type: "word",
          value: innerToken,
          kind: KIND_CJK_PUNCTUATION,
          isCJ: true,
          hasLeadingPunctuation: true,
          hasTrailingPunctuation: true,
        });
        continue;
      }

      // Korean uses space to divide words, but Chinese & Japanese do not
      // This is why Korean should be treated like non-CJK
      if (K_REGEXP.test(innerToken)) {
        appendNode({
          type: "word",
          value: innerToken,
          kind: KIND_K_LETTER,
          isCJ: false,
          hasLeadingPunctuation: false,
          hasTrailingPunctuation: false,
        });
        continue;
      }

      appendNode({
        type: "word",
        value: innerToken,
        kind: KIND_CJ_LETTER,
        isCJ: true,
        hasLeadingPunctuation: false,
        hasTrailingPunctuation: false,
      });
    }
  }

  // Check for `canBeConvertedToSpace` in ./print-whitespace.js etc.
  if (process.env.NODE_ENV !== "production") {
    for (let i = 1; i < nodes.length; i++) {
      assert.ok(
        !(nodes[i - 1].type === "whitespace" && nodes[i].type === "whitespace"),
        "splitText should not create consecutive whitespace nodes",
      );
    }
  }

  return nodes;

  /**
   * Push a word node, first inserting an empty-valued whitespace node when the
   * previous node is also a word. No separator is inserted between a non-CJK
   * word and CJK punctuation, or when either word contains U+3000.
   *
   * @param {WordNode} node
   */
  function appendNode(node) {
    const lastNode = nodes.at(-1);
    if (
      lastNode?.type === "word" &&
      !isBetween(KIND_NON_CJK, KIND_CJK_PUNCTUATION) &&
      // disallow leading/trailing full-width whitespace
      ![lastNode.value, node.value].some((value) => /\u3000/u.test(value))
    ) {
      nodes.push({ type: "whitespace", value: "" });
    }
    nodes.push(node);

    // True when the previous and the new node have the two given kinds, in
    // either order.
    function isBetween(kind1, kind2) {
      return (
        (lastNode.kind === kind1 && node.kind === kind2) ||
        (lastNode.kind === kind2 && node.kind === kind1)
      );
    }
  }
}

/**
 * Read the number and the spacing after the delimiter from the source text of
 * an ordered list item.
 *
 * @param {{position: {start: {offset: number}, end: {offset: number}}}} orderListItem
 * @param {{originalText: string}} options
 * @returns {{number: number, leadingSpaces: string}}
 * @throws {Error} When the item's source text does not start with an ordered
 *   list marker.
 */
function getOrderedListItemInfo(orderListItem, options) {
  const text = options.originalText.slice(
    orderListItem.position.start.offset,
    orderListItem.position.end.offset,
  );

  // The named groups are required: `m.groups` is `undefined` for a pattern
  // without any named capture group.
  const m = text.match(ORDERED_LIST_ITEM_REGEXP);

  if (!m) {
    throw new Error(
      `Failed to parse ordered list item: expected pattern matching ${ORDERED_LIST_ITEM_REGEXP}, but got: ${JSON.stringify(text)}`,
    );
  }

  const { numberText, leadingSpaces } = m.groups;

  return { number: Number(numberText), leadingSpaces };
}

/**
 * Tell whether an ordered list repeats the same number for its items:
 * either `1, 1, ...` or `0, 1, 1, ...`. A list that starts with `0, 1` needs a
 * third item numbered `1` to qualify.
 *
 * @param {{ordered: boolean, children: Array<object>}} node
 * @param {{originalText: string}} options
 * @returns {boolean}
 */
function hasGitDiffFriendlyOrderedList(node, options) {
  if (!node.ordered || node.children.length < 2) {
    return false;
  }

  const secondNumber = getOrderedListItemInfo(node.children[1], options).number;
  if (secondNumber !== 1) {
    return false;
  }

  const firstNumber = getOrderedListItemInfo(node.children[0], options).number;
  if (firstNumber !== 0) {
    return true;
  }

  // `0, 1` alone is ambiguous (it could be an incrementing list starting at
  // 0), so the third item decides.
  return (
    node.children.length > 2 &&
    getOrderedListItemInfo(node.children[2], options).number === 1
  );
}

/**
 * Return the value of a fenced code block, dropping one trailing newline when
 * the block runs to the very end of the original text.
 *
 * The final newline should not be included in the value.
 * https://github.com/remarkjs/remark/issues/512
 *
 * @param {{value: string, position: {end: {offset: number}}}} node
 * @param {string} originalText
 * @returns {string}
 */
function getFencedCodeBlockValue(node, originalText) {
  const { value } = node;

  if (
    node.position.end.offset === originalText.length &&
    value.endsWith("\n") &&
    // Code block has no end mark
    originalText.endsWith("\n")
  ) {
    return value.slice(0, -1);
  }

  return value;
}

/**
 * Build a new tree by calling `handler` on every node in pre-order.
 *
 * Each handler result is shallow-copied; when the copy has `children`, they
 * are replaced by their mapped versions. `handler` receives the node, its
 * index in the parent's children (`null` for the root) and the stack of
 * already-mapped ancestors, nearest first.
 *
 * @param {object} ast
 * @param {(node: object, index: number | null, parentStack: Array<object>) => object} handler
 * @returns {object}
 */
function mapAst(ast, handler) {
  return (function preorder(node, index, parentStack) {
    const newNode = { ...handler(node, index, parentStack) };
    if (newNode.children) {
      newNode.children = newNode.children.map((child, index) =>
        preorder(child, index, [newNode, ...parentStack]),
      );
    }

    return newNode;
  })(ast, null, []);
}

/**
 * Tell whether a node is a `link` whose only child starts and ends at the
 * same locations as the link itself.
 *
 * @param {object | null | undefined} node
 * @returns {boolean}
 */
function isAutolink(node) {
  if (node?.type !== "link" || node.children.length !== 1) {
    return false;
  }
  const [child] = node.children;
  return locStart(node) === locStart(child) && locEnd(node) === locEnd(child);
}

export {
  getFencedCodeBlockValue,
  getOrderedListItemInfo,
  hasGitDiffFriendlyOrderedList,
  INLINE_NODE_TYPES,
  INLINE_NODE_WRAPPER_TYPES,
  isAutolink,
  KIND_CJ_LETTER,
  KIND_CJK_PUNCTUATION,
  KIND_K_LETTER,
  KIND_NON_CJK,
  mapAst,
  splitText,
};