File size: 30,721 Bytes
6afedde |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 |
#!/usr/bin/env node
import { readFileSync, writeFileSync, existsSync, mkdirSync, unlinkSync } from 'fs';
import { join, dirname, basename } from 'path';
import { fileURLToPath } from 'url';
import { Client } from '@notionhq/client';
import { NotionConverter } from 'notion-to-md';
import { DefaultExporter } from 'notion-to-md/plugins/exporter';
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
/**
 * Make sure a directory is available at the given path.
 * Nothing happens when something already exists there; otherwise the
 * directory is created together with any missing parent directories.
 * @param {string} dir - Path of the directory to create
 */
function ensureDirectory(dir) {
  const alreadyPresent = existsSync(dir);
  if (alreadyPresent) {
    return;
  }
  mkdirSync(dir, { recursive: true });
}
/**
 * Run the complete clean-up pipeline over Markdown exported from Notion so
 * that the result is easier to consume as MDX.
 *
 * The stages run in a fixed order: exclude blocks are removed first, linked
 * Notion pages are inlined next (the only asynchronous stage, and the only one
 * that needs the Notion client/token), and the remaining purely textual
 * transformations follow one after another.
 *
 * @param {string} content - Raw markdown content from Notion
 * @param {Client} notionClient - Notion API client (optional)
 * @param {string} notionToken - Notion API token (optional)
 * @returns {Promise<string>} - Processed markdown content
 */
export async function postProcessMarkdown(content, notionClient = null, notionToken = null) {
  console.log('π§ Post-processing Notion Markdown for MDX compatibility...');

  const withoutExcluded = removeExcludeTags(content);
  const withIncludedPages = await includeNotionPages(withoutExcluded, notionClient, notionToken);

  // Synchronous string -> string stages, applied strictly in this order.
  const remainingSteps = [
    cleanNotionArtifacts,
    fixImageAltTextWithLinks,
    fixNotionLinks,
    fixJsxAttributes,
    optimizeImages,
    shiftHeadingLevels,
    cleanEmptyLines,
    fixCodeBlocks,
    fixCodeBlockEndings,
    unwrapHtmlCodeBlocks,
    fixPlainTextCodeBlocks,
    optimizeTables,
  ];

  let processedContent = withIncludedPages;
  for (const step of remainingSteps) {
    processedContent = step(processedContent);
  }
  return processedContent;
}
/**
 * Strip every <exclude>...</exclude> block from the content and clean up what
 * only those blocks referenced.
 *
 * Besides removing the blocks this function:
 * - deletes media files whose Markdown image appeared inside a removed block,
 *   looking the file name up in the known media directories;
 * - drops `import X from '...'` lines for image variables (`src={X}`) that are
 *   no longer used once the blocks are gone.
 *
 * A media file is kept when its file name still occurs in the remaining
 * content, so an image shared with a non-excluded section is not deleted.
 *
 * @param {string} content - Markdown content
 * @returns {string} - Content with exclude tags removed and unused imports cleaned
 */
function removeExcludeTags(content) {
  console.log(' ποΈ Removing <exclude> tags and associated media...');
  const removedImageVariables = new Set();
  const mediaFilesToDelete = new Set();

  // Collect image variables and media file names BEFORE the blocks disappear.
  const excludeBlocks = content.match(/<exclude>[\s\S]*?<\/exclude>/g) || [];
  for (const block of excludeBlocks) {
    // JSX images: src={variableName}
    for (const [, varName] of block.matchAll(/src=\{([^}]+)\}/g)) {
      removedImageVariables.add(varName);
    }
    // Markdown images: only the file name of a path like /media/pageId/filename.png is kept
    for (const [, src] of block.matchAll(/!\[[^\]]*\]\(([^)]+)\)/g)) {
      const filename = basename(src);
      if (filename) {
        mediaFilesToDelete.add(filename);
      }
    }
  }

  // Remove <exclude> tags and everything between them (including multiline)
  const removedCount = excludeBlocks.length;
  let remaining = content.replace(/<exclude>[\s\S]*?<\/exclude>/g, '');

  // Delete associated media files
  if (mediaFilesToDelete.size > 0) {
    console.log(` ποΈ Found ${mediaFilesToDelete.size} media file(s) to delete from exclude blocks`);
    // Try to find and delete media files in common locations
    const possibleMediaDirs = [
      join(__dirname, 'output', 'media'),
      join(__dirname, '..', '..', 'src', 'content', 'assets', 'image')
    ];
    for (const filename of mediaFilesToDelete) {
      // Never delete a file that the surviving content still points at.
      if (remaining.includes(filename)) {
        console.log(` βΉοΈ Keeping media file still referenced outside exclude blocks: ${filename}`);
        continue;
      }
      let deleted = false;
      for (const mediaDir of possibleMediaDirs) {
        if (!existsSync(mediaDir)) {
          continue;
        }
        const filePath = join(mediaDir, filename);
        if (!existsSync(filePath)) {
          continue;
        }
        try {
          unlinkSync(filePath);
          console.log(` ποΈ Deleted media file: ${filename}`);
          deleted = true;
          break;
        } catch (error) {
          // Best effort: report the failure and try the next candidate directory.
          console.log(` β οΈ Failed to delete ${filename}: ${error.message}`);
        }
      }
      if (!deleted) {
        console.log(` βΉοΈ Media file not found: ${filename}`);
      }
    }
  }

  // Remove unused image imports that were only used in exclude blocks
  if (removedImageVariables.size > 0) {
    console.log(` πΌοΈ Found ${removedImageVariables.size} unused image import(s) in exclude blocks`);
    for (const varName of removedImageVariables) {
      // `{name}` also covers `src={name}`, so one check is enough.
      if (remaining.includes(`{${varName}}`)) {
        continue;
      }
      // Pattern: import VarName from './assets/image/filename';
      const escapedName = varName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
      const importPattern = new RegExp(`import\\s+${escapedName}\\s+from\\s+['"][^'"]+['"];?\\s*`, 'g');
      remaining = remaining.replace(importPattern, '');
      console.log(` ποΈ Removed unused import: ${varName}`);
    }
    console.log(` π§Ή Cleaned up unused image imports`);
  }

  if (removedCount > 0) {
    console.log(` β
Removed ${removedCount} <exclude> tag(s) and their content`);
  } else {
    console.log(' βΉοΈ No <exclude> tags found');
  }
  return remaining;
}
/**
 * Replace Markdown links that point at a Notion page id with the content of
 * that page.
 *
 * A link qualifies when its target is a bare lowercase UUID with dashes, e.g.
 * `[text](01234567-89ab-cdef-0123-456789abcdef)`. Each such page is converted
 * with notion-to-md, its media is downloaded below `output/media/<pageId>`, the
 * raw Markdown is saved next to it as `<slug>-<pageId>.raw.md`, and the link is
 * replaced by the page body (YAML frontmatter and the leading heading removed).
 * Links located inside an <exclude> block are left alone, and a link whose page
 * cannot be fetched is kept unchanged.
 *
 * @param {string} content - Markdown content
 * @param {Client} notionClient - Notion API client
 * @param {string} notionToken - Notion API token
 * @returns {Promise<string>} - Content with page links replaced
 */
async function includeNotionPages(content, notionClient, notionToken) {
  console.log(' π Including linked Notion pages...');
  if (!notionClient || !notionToken) {
    console.log(' βΉοΈ Skipping page inclusion (no Notion client/token provided)');
    return content;
  }
  let includedCount = 0;
  let skippedCount = 0;

  // First, identify all exclude blocks to avoid processing links within them.
  // `end` is exclusive: it is the index of the first character AFTER the block.
  const excludeBlocks = [...content.matchAll(/<exclude>[\s\S]*?<\/exclude>/g)].map((found) => ({
    start: found.index,
    end: found.index + found[0].length
  }));
  // Because `end` is exclusive the comparison must be strict, otherwise a link
  // that starts immediately after `</exclude>` would be treated as excluded.
  const isWithinExcludeBlock = (position) => {
    return excludeBlocks.some(block => position >= block.start && position < block.end);
  };

  // Regex to match links to Notion pages with UUID format
  // Pattern: [text](uuid-with-dashes)
  const notionPageLinkRegex = /\[([^\]]+)\]\(([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\)/g;
  let processedContent = content;

  // Find all matches
  const matches = [];
  for (const match of content.matchAll(notionPageLinkRegex)) {
    const [fullMatch, linkText, pageId] = match;
    // Skip if this link is within an exclude block
    if (isWithinExcludeBlock(match.index)) {
      console.log(` βοΈ Skipping page link in exclude block: ${linkText} (${pageId})`);
      skippedCount++;
      continue;
    }
    matches.push({
      fullMatch,
      linkText,
      pageId,
      startPos: match.index,
      endPos: match.index + fullMatch.length
    });
  }

  // Process matches in reverse order so that the positions recorded against the
  // original content stay valid while earlier parts are still untouched.
  for (let i = matches.length - 1; i >= 0; i--) {
    const link = matches[i];
    try {
      console.log(` π Fetching content for page: ${link.pageId}`);
      // Create media directory for this sub-page
      const outputDir = join(__dirname, 'output');
      const mediaDir = join(outputDir, 'media', link.pageId);
      ensureDirectory(mediaDir);
      // Configure the DefaultExporter to get content as string
      const exporter = new DefaultExporter({
        outputType: 'string',
      });
      // Create the converter with media downloading strategy
      const converter = new NotionConverter(notionClient)
        .withExporter(exporter)
        // Download media to local directory with path transformation
        .downloadMediaTo({
          outputDir: mediaDir,
          // Transform paths to be web-accessible
          transformPath: (localPath) => `/media/${link.pageId}/${basename(localPath)}`,
        });
      // Convert the page
      const result = await converter.convert(link.pageId);
      console.log(` πΌοΈ Media saved to: ${mediaDir}`);
      if (result && result.content) {
        // Save raw content as .raw.md file
        const rawFileName = `${link.linkText.toLowerCase().replace(/[^a-z0-9]+/g, '-')}-${link.pageId}`;
        const rawFilePath = join(outputDir, `${rawFileName}.raw.md`);
        try {
          writeFileSync(rawFilePath, result.content);
          console.log(` π Saved raw markdown: ${rawFileName}.raw.md`);
        } catch (error) {
          // The raw copy is only a debugging aid; failing to write it is not fatal.
          console.log(` β οΈ Failed to save raw file: ${error.message}`);
        }
        // Clean the content (remove frontmatter, etc.)
        let pageContent = result.content;
        // Remove YAML frontmatter if present
        pageContent = pageContent.replace(/^---[\s\S]*?---\s*\n/, '');
        // Remove the first markdown heading (H1, H2, H3, etc.) from the included page
        pageContent = pageContent.replace(/^#+ .+\n\n?/, '');
        // Keep the page content without title
        const finalContent = '\n\n' + pageContent.trim() + '\n\n';
        // Replace the link with the content
        processedContent = processedContent.substring(0, link.startPos) +
          finalContent +
          processedContent.substring(link.endPos);
        includedCount++;
        console.log(` β
Included page content: ${link.linkText}`);
      } else {
        console.log(` β οΈ No content found for page: ${link.pageId}`);
      }
    } catch (error) {
      console.log(` β Failed to fetch page ${link.pageId}: ${error.message}`);
      // Keep the original link if we can't fetch the content
    }
  }

  if (includedCount > 0) {
    console.log(` β
Included ${includedCount} Notion page(s)`);
  } else {
    console.log(' βΉοΈ No Notion page links found to include');
  }
  if (skippedCount > 0) {
    console.log(` βοΈ Skipped ${skippedCount} page link(s) in exclude blocks`);
  }
  return processedContent;
}
/**
 * Remove formatting leftovers that Notion exports tend to contain.
 *
 * Applied in order: links to www.notion.so are reduced to their text, bold
 * callout titles in blockquotes get a blank line after them, divider lines made
 * of dashes and empty blockquote lines are emptied, and stray asterisk runs
 * left behind after bold / bold-italic text are dropped.
 *
 * @param {string} content - Markdown content
 * @returns {string} - Cleaned content
 */
function cleanNotionArtifacts(content) {
  console.log(' π§Ή Cleaning Notion artifacts...');
  let cleanedCount = 0;

  // Wrap a replacer so that every match it handles is tallied.
  const counted = (replacer) => (...args) => {
    cleanedCount += 1;
    return replacer(...args);
  };

  const steps = [
    // [text](https://www.notion.so/...) -> text (the link would be broken anyway)
    [/\[([^\]]+)\]\(https:\/\/www\.notion\.so\/[^)]+\)/g, counted((_whole, text) => text)],
    // Callout titles: make sure a blank line follows the bold title line
    [/^> \*\*([^*]+)\*\*\s*\n/gm, '> **$1**\n\n'],
    // Page dividers
    [/^---+\s*$/gm, ''],
    // Empty blockquote lines
    [/^>\s*$/gm, ''],
    // ***text*** **** -> ***text***
    [/\*\*\*([^*]+)\*\*\*\s+\*\*\*\*/g, counted((_whole, text) => `***${text.trim()}***`)],
    // **text** ** -> **text**
    [/\*\*([^*]+)\*\*\s+\*\*/g, counted((_whole, text) => `**${text.trim()}**`)],
  ];

  let cleaned = content;
  for (const [pattern, replacement] of steps) {
    cleaned = cleaned.replace(pattern, replacement);
  }

  if (cleanedCount > 0) {
    console.log(` β
Cleaned ${cleanedCount} Notion artifact(s)`);
  }
  return cleaned;
}
/**
* Fix image alt text that contains markdown links
* notion-to-md v4 sometimes generates: ](image_path)
* This breaks MDX parsing. Clean it to: 
* @param {string} content - Markdown content
* @returns {string} - Content with fixed image alt text
*/
function fixImageAltTextWithLinks(content) {
console.log(' πΌοΈ Fixing image alt text with embedded links...');
let fixedCount = 0;
// Pattern:  more_text](image_path)
// This regex finds images where the alt text contains markdown links
const imageWithLinksPattern = /!\[([^\]]*\[[^\]]+\]\([^)]+\)[^\]]*)\]\(([^)]+)\)/g;
content = content.replace(imageWithLinksPattern, (match, altText, imagePath) => {
fixedCount++;
// Remove all markdown links from alt text: [text](url) -> text
const cleanedAlt = altText.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1');
// Also clean up any remaining brackets
const finalAlt = cleanedAlt.replace(/[\[\]]/g, '');
console.log(` π§ Fixed: "${altText.substring(0, 50)}..." -> "${finalAlt.substring(0, 50)}..."`);
return ``;
});
if (fixedCount > 0) {
console.log(` β
Fixed ${fixedCount} image(s) with embedded links in alt text`);
} else {
console.log(' βΉοΈ No images with embedded links found');
}
return content;
}
/**
 * Fix Notion internal links to be more MDX-friendly.
 *
 * Links of the form `https://www.notion.so/<workspace>/<page>` become in-page
 * anchors `#<page>`; every other www.notion.so link is reduced to its text.
 *
 * @param {string} content - Markdown content
 * @returns {string} - Content with fixed links
 */
function fixNotionLinks(content) {
  console.log(' π Fixing Notion internal links...');
  let fixedCount = 0;
  // Convert Notion page links to relative links (assuming they'll be converted to MDX).
  // The workspace segment must not contain ")" - otherwise the match could run
  // past the end of one link and swallow everything up to a later link.
  let fixedContent = content.replace(/\[([^\]]+)\]\(https:\/\/www\.notion\.so\/[^/)]+\/([^?#)]+)\)/g, (match, text, pageId) => {
    fixedCount++;
    // Convert to relative link - this will need to be updated based on your routing
    return `[${text}](#${pageId})`;
  });
  // Fix broken notion.so links that might be malformed
  fixedContent = fixedContent.replace(/\[([^\]]+)\]\(https:\/\/www\.notion\.so\/[^)]*\)/g, (match, text) => {
    fixedCount++;
    return text; // Remove broken links, keep text
  });
  if (fixedCount > 0) {
    console.log(` β
Fixed ${fixedCount} Notion link(s)`);
  }
  return fixedContent;
}
/**
 * Repair JSX/HTML attributes that were mangled while converting from Notion.
 *
 * Two families of damage are handled:
 * - attribute names wrapped in emphasis markers, e.g.
 *   `<HtmlEmbed *src* ="/path" />` or `<HtmlEmbed **src** ="/path" />`,
 *   which are rewritten to `<HtmlEmbed src="/path" />`;
 * - `src` values of iframe/video/audio/embed/object tags that were turned into
 *   a markdown link, e.g. `src="[url](url)"`, which become `src="url"`.
 *
 * @param {string} content - Markdown content
 * @returns {string} - Content with fixed JSX attributes
 */
function fixJsxAttributes(content) {
  console.log(' π§ Fixing JSX attributes corrupted by Notion conversion...');
  let fixedCount = 0;

  // Replacers shared by several rules; each one tallies the match it rewrites.
  const quotedAttribute = (_whole, tagName, attribute, value) => {
    fixedCount += 1;
    return `<${tagName} ${attribute}="${value}" />`;
  };
  const bareAttribute = (_whole, tagName, attribute, value) => {
    fixedCount += 1;
    return `<${tagName} ${attribute}=${value} />`;
  };
  const pairedIframe = (_whole, before, urlText, after) => {
    fixedCount += 1;
    return `<iframe${before} src="${urlText}"${after}></iframe>`;
  };
  const selfClosingIframe = (_whole, before, urlText, after) => {
    fixedCount += 1;
    return `<iframe${before} src="${urlText}"${after} />`;
  };
  const pairedMedia = (_whole, tagName, before, urlText, after) => {
    fixedCount += 1;
    return `<${tagName}${before} src="${urlText}"${after}></${tagName}>`;
  };
  const selfClosingMedia = (_whole, tagName, before, urlText, after) => {
    fixedCount += 1;
    return `<${tagName}${before} src="${urlText}"${after} />`;
  };

  // Order matters: the open/close tag forms must run before the self-closing
  // forms, which would otherwise also match the opening tag of a pair.
  const rules = [
    // <TagName *attribute* ="value" />
    [/<(\w+)\s+\*\s*([^*\s]+)\s*\*\s*=\s*"([^"]*)"\s*\/?>/g, quotedAttribute],
    // <TagName *attribute* =value />
    [/<(\w+)\s+\*\s*([^*\s]+)\s*\*\s*=\s*([^>\s\/]+)\s*\/?>/g, bareAttribute],
    // <TagName **attribute** ="value" />
    [/<(\w+)\s+\*\*\s*([^*\s]+)\s*\*\*\s*=\s*"([^"]*)"\s*\/?>/g, quotedAttribute],
    // <TagName **attribute** =value />
    [/<(\w+)\s+\*\*\s*([^*\s]+)\s*\*\*\s*=\s*([^>\s\/]+)\s*\/?>/g, bareAttribute],
    // <iframe ... src="[url](url)" ...></iframe>
    [/<iframe([^>]*?)\ssrc=[""''""\u201C\u201D\u2018\u2019]\[([^\]]+)\]\([^)]+\)[""''""\u201C\u201D\u2018\u2019]([^>]*?)>\s*<\/iframe>/gi, pairedIframe],
    // <iframe ... src="[url](url)" ... />
    [/<iframe([^>]*?)\ssrc=[""''""\u201C\u201D\u2018\u2019]\[([^\]]+)\]\([^)]+\)[""''""\u201C\u201D\u2018\u2019]([^>]*?)\s*\/?>/gi, selfClosingIframe],
    // <video ... src="[url](url)" ...></video> (also audio, embed, object)
    [/<(video|audio|embed|object)([^>]*?)\ssrc=[""''""\u201C\u201D\u2018\u2019]\[([^\]]+)\]\([^)]+\)[""''""\u201C\u201D\u2018\u2019]([^>]*?)>\s*<\/\1>/gi, pairedMedia],
    // <video ... src="[url](url)" ... /> (also audio, embed, object)
    [/<(video|audio|embed|object)([^>]*?)\ssrc=[""''""\u201C\u201D\u2018\u2019]\[([^\]]+)\]\([^)]+\)[""''""\u201C\u201D\u2018\u2019]([^>]*?)\s*\/?>/gi, selfClosingMedia],
  ];

  let repaired = content;
  for (const [pattern, replacer] of rules) {
    repaired = repaired.replace(pattern, replacer);
  }

  if (fixedCount > 0) {
    console.log(` β
Fixed ${fixedCount} corrupted JSX attribute(s)`);
  }
  return repaired;
}
/**
 * Optimize images for better MDX compatibility.
 *
 * Images without alt text get the file name of their source as alt text, and
 * query strings are stripped from image paths.
 *
 * @param {string} content - Markdown content
 * @returns {string} - Content with optimized images
 */
function optimizeImages(content) {
  console.log(' πΌοΈ Optimizing images...');
  let optimizedCount = 0;
  // Ensure images have proper alt text: ![](path/file.png) -> ![file.png](path/file.png)
  let optimized = content.replace(/!\[\]\(([^)]+)\)/g, (match, src) => {
    optimizedCount++;
    // Ignore a query string when deriving the alt text from the file name.
    const filename = basename(src.split('?')[0]);
    return `![${filename}](${src})`;
  });
  // Clean up image paths that might have query parameters:
  // ![alt](path/file.png?width=200) -> ![alt](path/file.png)
  optimized = optimized.replace(/!\[([^\]]*)\]\(([^)?]+)\?[^)]*\)/g, (match, alt, src) => {
    optimizedCount++;
    return `![${alt}](${src})`;
  });
  if (optimizedCount > 0) {
    console.log(` β
Optimized ${optimizedCount} image(s)`);
  }
  return optimized;
}
/**
 * Shift all heading levels down by one (H1 -> H2, H2 -> H3, ... H5 -> H6).
 *
 * H6 headings cannot go any deeper and are left as they are. Lines inside
 * fenced code blocks are never touched, so a `# comment` in a shell or Python
 * snippet keeps its single hash.
 *
 * @param {string} content - Markdown content
 * @returns {string} - Content with shifted heading levels
 */
function shiftHeadingLevels(content) {
  console.log(' π Shifting heading levels down by one...');
  let shiftedCount = 0;
  // Character of the currently open fence ("`" or "~"), or null outside a code block.
  let openFence = null;

  const shiftedLines = content.split('\n').map((line) => {
    const fence = line.match(/^ {0,3}(`{3,}|~{3,})/);
    if (fence) {
      const marker = fence[1][0];
      if (openFence === null) {
        openFence = marker;
      } else if (openFence === marker) {
        openFence = null;
      }
      return line;
    }
    // One to five hashes followed by a space: a heading that can still be shifted.
    if (openFence === null && /^#{1,5} /.test(line)) {
      shiftedCount++;
      return `#${line}`;
    }
    return line;
  });

  console.log(` β
Shifted ${shiftedCount} heading level(s)`);
  return shiftedLines.join('\n');
}
/**
 * Fix code block fences that read "```text" where a bare "```" is wanted.
 *
 * Every "```text" that is directly followed by a newline is rewritten to
 * "```", and the number of rewritten fences is reported.
 *
 * @param {string} content - Markdown content
 * @returns {string} - Content with fixed code block endings
 */
function fixCodeBlockEndings(content) {
  console.log(' π» Fixing code block endings...');
  let fixedCount = 0;
  // Count inside the replacer: counting after the replacement would always
  // find zero occurrences because they have all been rewritten by then.
  const fixedContent = content.replace(/```text\n/g, () => {
    fixedCount++;
    return '```\n';
  });
  if (fixedCount > 0) {
    console.log(` β
Fixed ${fixedCount} code block ending(s)`);
  }
  return fixedContent;
}
/**
 * Collapse long runs of blank lines.
 *
 * Only runs of four or more consecutive newlines are reduced (to two), which
 * keeps the ordinary single blank line between paragraphs intact.
 *
 * @param {string} content - Markdown content
 * @returns {string} - Content with cleaned spacing
 */
function cleanEmptyLines(content) {
  console.log(' π Cleaning excessive empty lines...');
  const countLines = (text) => text.split('\n').length;

  const cleanedContent = content.replace(/\n{4,}/g, '\n\n');
  const removedLines = countLines(content) - countLines(cleanedContent);

  if (removedLines > 0) {
    console.log(` β
Removed ${removedLines} excessive empty line(s)`);
  }
  return cleanedContent;
}
/**
 * Fix fenced code blocks for better MDX compatibility.
 *
 * - An opening ``` fence without a language gets the identifier `text`.
 * - Non-breaking spaces (U+00A0) inside a fenced block become ordinary spaces.
 *
 * The content is walked line by line while tracking whether the current line
 * is inside a fence, so closing fences are left as a bare ``` and text outside
 * code blocks is not modified.
 *
 * @param {string} content - Markdown content
 * @returns {string} - Content with fixed code blocks
 */
function fixCodeBlocks(content) {
  console.log(' π» Fixing code blocks...');
  let fixedCount = 0;
  let insideBlock = false;
  // Whether the block currently being walked has already been counted.
  let blockCounted = false;

  const markBlockFixed = () => {
    if (!blockCounted) {
      blockCounted = true;
      fixedCount++;
    }
  };

  const fixedLines = content.split('\n').map((line) => {
    if (!insideBlock) {
      if (!line.startsWith('```')) {
        return line;
      }
      // Opening fence.
      insideBlock = true;
      blockCounted = false;
      if (/^```\s*$/.test(line)) {
        markBlockFixed();
        // Keep a carriage return so CRLF documents stay consistent.
        return line.endsWith('\r') ? '```text\r' : '```text';
      }
      return line;
    }
    if (/^```\s*$/.test(line)) {
      // Closing fence: must stay a bare fence.
      insideBlock = false;
      return line;
    }
    if (!line.includes('\u00A0')) {
      return line;
    }
    markBlockFixed();
    return line.replace(/\u00A0/g, ' ');
  });

  if (fixedCount > 0) {
    console.log(` β
Fixed ${fixedCount} code block(s)`);
  }
  return fixedLines.join('\n');
}
/**
 * Optimize tables for better MDX rendering.
 *
 * Every table row (a line that starts and ends with a pipe) is re-emitted with
 * its cells trimmed and separated by " | ". Empty cells are preserved so that
 * columns do not shift. When the separator row under a header has a different
 * number of cells than the header, it is regenerated as plain `---` cells.
 *
 * @param {string} content - Markdown content
 * @returns {string} - Content with optimized tables
 */
function optimizeTables(content) {
  console.log(' π Optimizing tables...');
  let optimizedCount = 0;

  // Split the text between the outer pipes of a row into trimmed cells.
  const splitCells = (row) => row.split('|').map(cell => cell.trim());

  // Normalise spacing of each row. Only spaces/tabs are allowed after the
  // closing pipe so the match never consumes the following line break.
  let optimized = content.replace(/^\|(.+)\|[ \t]*$/gm, (match, row) => {
    const cells = splitCells(row);
    if (cells.every(cell => cell.length === 0)) {
      return match;
    }
    const normalizedRow = `| ${cells.join(' | ')} |`;
    if (normalizedRow !== match) {
      optimizedCount++;
    }
    return normalizedRow;
  });

  // Ensure the separator row matches the header's column count
  optimized = optimized.replace(/^\|(.+)\|[ \t]*\r?\n\|([-: \t|]+)\|[ \t]*$/gm, (match, header, separator) => {
    const headerCells = splitCells(header);
    const separatorCells = splitCells(separator);
    if (headerCells.length === separatorCells.length) {
      return match;
    }
    optimizedCount++;
    const newSeparator = headerCells.map(() => '---').join(' | ');
    return `| ${headerCells.join(' | ')} |\n| ${newSeparator} |`;
  });

  if (optimizedCount > 0) {
    console.log(` β
Optimized ${optimizedCount} table(s)`);
  }
  return optimized;
}
/**
 * Replace every ```html fenced block with the HTML it contains, so the markup
 * becomes part of the MDX document instead of being shown as code.
 * @param {string} content - Markdown content
 * @returns {string} - Content with unwrapped HTML code blocks
 */
function unwrapHtmlCodeBlocks(content) {
  console.log(' π§ Unwrapping HTML code blocks for MDX integration...');
  let unwrappedCount = 0;

  // Drops the fence lines and returns the trimmed inner HTML.
  const unwrap = (_fencedBlock, htmlContent) => {
    unwrappedCount += 1;
    const inner = htmlContent.trim();
    console.log(` π§ Unwrapped HTML code block (${inner.length} chars)`);
    return inner;
  };

  // Matches the whole block: the ```html opener, the body and the closing ```
  const unwrapped = content.replace(/```html\s*\n([\s\S]*?)\n```/g, unwrap);

  if (unwrappedCount === 0) {
    console.log(' βΉοΈ No HTML code blocks found to unwrap');
  } else {
    console.log(` β
Unwrapped ${unwrappedCount} HTML code block(s) for MDX integration`);
  }
  return unwrapped;
}
/**
 * Turn ```plain text fenced blocks into fences without a language identifier,
 * leaving the code inside untouched.
 * @param {string} content - Markdown content
 * @returns {string} - Content with fixed plain text code blocks
 */
function fixPlainTextCodeBlocks(content) {
  console.log(' π§ Fixing plain text code blocks...');
  let fixedCount = 0;

  // Re-emits the block with a bare opening fence.
  const stripIdentifier = (_fencedBlock, codeContent) => {
    fixedCount += 1;
    console.log(` π§ Fixed plain text code block (${codeContent.length} chars)`);
    return ['```', codeContent, '```'].join('\n');
  };

  const fixedContent = content.replace(/```plain text\s*\n([\s\S]*?)\n```/g, stripIdentifier);

  if (fixedCount === 0) {
    console.log(' βΉοΈ No plain text code blocks found to fix');
  } else {
    console.log(` β
Fixed ${fixedCount} plain text code block(s)`);
  }
  return fixedContent;
}
/**
 * Build a YAML frontmatter block from Notion page properties.
 *
 * `title` (default "Untitled"), today's date as `published` and
 * `tableOfContentsAutoCollapse` are always emitted; `description`, `tags` and
 * `author` are added when present. Scalar values are written as double-quoted
 * strings with quotes, backslashes and control characters escaped, so a title
 * such as `He said "hi"` still yields valid YAML. Arrays are written as a list
 * with one item per line.
 *
 * @param {Object} pageProperties - Notion page properties
 * @returns {string} - YAML frontmatter
 */
export function generateFrontmatter(pageProperties) {
  console.log(' π Generating frontmatter from Notion properties...');
  const frontmatter = {
    title: pageProperties.title || 'Untitled',
    published: new Date().toISOString().split('T')[0],
    tableOfContentsAutoCollapse: true
  };
  // Add other properties if they exist
  for (const key of ['description', 'tags', 'author']) {
    if (pageProperties[key]) {
      frontmatter[key] = pageProperties[key];
    }
  }
  // Convert to YAML string
  const yamlLines = Object.entries(frontmatter)
    .map(([key, value]) => {
      if (Array.isArray(value)) {
        if (value.length === 0) {
          return `${key}: []`;
        }
        return `${key}:\n${value.map(v => ` - ${v}`).join('\n')}`;
      }
      // JSON string syntax is accepted as a YAML double-quoted scalar and takes
      // care of escaping embedded quotes, backslashes and newlines.
      return `${key}: ${JSON.stringify(String(value))}`;
    });
  return `---\n${yamlLines.join('\n')}\n---\n\n`;
}
/**
 * Command-line entry point: read a Markdown file, post-process it and write
 * the result.
 *
 * The first argument ending in `.md` is the input file and the second one, if
 * any, the output file; without an output file the result is written next to
 * the input as `<name>.processed.md`. `--help`/`-h` prints usage and exits,
 * `--verbose` additionally reports input and output sizes. The process exits
 * with status 1 when the input is missing or processing fails.
 *
 * The function is async because postProcessMarkdown returns a promise that has
 * to be awaited before its result can be written to disk.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = process.argv.slice(2);
  if (args.includes('--help') || args.includes('-h')) {
    console.log(`
π§ Notion Markdown Post-Processor
Usage:
node post-processor.mjs [options] [input-file] [output-file]
Options:
--verbose Show detailed processing information
--help, -h Show this help
Examples:
# Process a single file
node post-processor.mjs input.md output.md
# Process with verbose output
node post-processor.mjs --verbose input.md output.md
`);
    process.exit(0);
  }
  const verbose = args.includes('--verbose');
  const inputFile = args.find(arg => !arg.startsWith('--') && arg.endsWith('.md'));
  const outputFile = args.find(arg => !arg.startsWith('--') && arg !== inputFile && arg.endsWith('.md'));
  if (!inputFile) {
    console.error('β Please provide an input markdown file');
    process.exit(1);
  }
  if (!existsSync(inputFile)) {
    console.error(`β Input file not found: ${inputFile}`);
    process.exit(1);
  }
  try {
    console.log(`π Reading: ${inputFile}`);
    const content = readFileSync(inputFile, 'utf8');
    const processedContent = await postProcessMarkdown(content);
    // Only the trailing extension is rewritten; a ".md" occurring earlier in
    // the path (e.g. in a directory name) must stay untouched.
    const finalOutputFile = outputFile || inputFile.replace(/\.md$/, '.processed.md');
    writeFileSync(finalOutputFile, processedContent);
    console.log(`β
Processed: ${finalOutputFile}`);
    if (verbose) {
      console.log(`π Input: ${content.length} chars β Output: ${processedContent.length} chars`);
    }
  } catch (error) {
    console.error('β Processing failed:', error.message);
    process.exit(1);
  }
}
// Run the CLI only when this module is the script Node was started with.
// import.meta.url is a percent-encoded file: URL, so it is converted back to a
// file-system path before being compared with process.argv[1]. Comparing the
// raw strings fails for paths containing spaces or non-ASCII characters, and
// always fails on Windows.
if (process.argv[1] && fileURLToPath(import.meta.url) === process.argv[1]) {
  main();
}
|