// forensic-pathways/src/utils/markdown.ts

// src/utils/markdown.ts
// Simple markdown parser for client-side preview functionality
// Note: For production, consider using a proper markdown library like marked or markdown-it

/**
 * Options accepted by the SimpleMarkdownParser constructor and by parseMarkdown.
 * Every field is optional; the parser constructor supplies the defaults noted below.
 */
export interface MarkdownParseOptions {
  /** Whether parse() runs the parser's HTML sanitization step. The parser defaults this to true. */
  sanitize?: boolean;
  /**
   * Whether parse() runs its paragraph / line-break step (blank lines separate
   * paragraphs, single newlines become <br>). The parser defaults this to true;
   * when false that step is skipped entirely.
   */
  breaks?: boolean;
  /**
   * Value of the target attribute on generated links (emitted together with
   * rel="noopener noreferrer"). The parser defaults this to '_blank'; an empty
   * string omits both attributes.
   */
  linkTarget?: string;
}

/**
 * Small, dependency-free markdown-to-HTML converter intended for previews.
 *
 * Supported syntax: fenced code blocks, inline code, ATX headers, bold/italic,
 * links, images, flat ordered/unordered lists, blockquotes and horizontal rules.
 *
 * How it works: HTML that the parser itself builds from escaped input (code,
 * `<img>` tags, `<a ...>` opening tags) is moved into a stash and replaced by an
 * opaque token (NUL-delimited, so it cannot collide with markdown syntax). The
 * remaining text goes through the markdown passes and, when `sanitize` is on,
 * through an allow-list sanitizer. The stash is put back as the very last step,
 * so no pass can re-interpret code content or attribute values.
 *
 * This is still not a full CommonMark implementation; for untrusted input in
 * production prefer a maintained parser plus a sanitizer such as DOMPurify.
 */
export class SimpleMarkdownParser {
  /** Tags that survive sanitization (always re-emitted without attributes). */
  private static readonly ALLOWED_TAGS: ReadonlySet<string> = new Set([
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'p', 'br', 'strong', 'em', 'code', 'pre',
    'a', 'img', 'ul', 'ol', 'li', 'blockquote', 'hr'
  ]);

  private readonly options: Required<MarkdownParseOptions>;

  /**
   * @param options Parser options. A missing or explicitly `undefined` field
   *   falls back to its default (`sanitize: true`, `breaks: true`,
   *   `linkTarget: '_blank'`), so `{ sanitize: undefined }` can never switch
   *   sanitization off by accident.
   */
  constructor(options: MarkdownParseOptions = {}) {
    const given = options || {};
    this.options = {
      sanitize: given.sanitize !== undefined ? given.sanitize : true,
      breaks: given.breaks !== undefined ? given.breaks : true,
      linkTarget: given.linkTarget !== undefined ? given.linkTarget : '_blank'
    };
  }

  /**
   * Parse markdown to HTML.
   *
   * @param markdown Markdown source; CRLF/CR line endings are accepted.
   * @returns The generated HTML, or `''` for empty / whitespace-only input.
   */
  parse(markdown: string): string {
    if (!markdown || markdown.trim().length === 0) {
      return '';
    }

    const stash: string[] = [];

    // Normalize line endings and neutralize NUL so stash tokens cannot be forged.
    let html = markdown.replace(/\r\n?/g, '\n').replace(/\u0000/g, '\uFFFD');

    // Protect everything whose content must not be re-interpreted.
    html = this.parseCodeBlocks(html, stash);
    html = this.parseInlineCode(html, stash);
    html = this.parseLinksAndImages(html, stash);

    // Rules go before emphasis so that "***" is not read as emphasis markers.
    html = this.parseHorizontalRules(html);
    html = this.parseHeaders(html);
    html = this.parseEmphasis(html);
    html = this.parseLists(html);
    html = this.parseBlockquotes(html);

    // Only user-supplied raw HTML and attribute-free generated tags are visible
    // to the sanitizer; stashed fragments are already escaped.
    if (this.options.sanitize) {
      html = this.sanitizeHtml(html);
    }

    html = this.parseLineBreaks(html);
    html = this.restoreStash(html, stash);

    return html.trim();
  }

  /** Store a finished HTML fragment and return the token that stands in for it ('B' = block, 'I' = inline). */
  private stashHtml(stash: string[], html: string, kind: 'B' | 'I'): string {
    stash.push(html);
    return `\u0000${kind}${stash.length - 1}\u0000`;
  }

  /** Replace every stash token with its fragment (function replacer, so `$` in code is kept literally). */
  private restoreStash(html: string, stash: string[]): string {
    return html.replace(/\u0000[BI](\d+)\u0000/g, (_token: string, index: string) => stash[Number(index)] || '');
  }

  /**
   * Convert ``` fenced blocks to `<pre><code>` and stash them.
   * The first line after the opening fence is the info string; its first word
   * becomes `class="language-…"`. A fence without a newline has no language.
   */
  private parseCodeBlocks(html: string, stash: string[]): string {
    return html.replace(/```([\s\S]*?)```/g, (_match: string, code: string) => {
      const newline = code.indexOf('\n');
      const info = newline === -1 ? '' : code.slice(0, newline).trim();
      const body = newline === -1 ? code : code.slice(newline + 1);
      const lang = info.split(/\s+/)[0];
      const cls = lang ? ` class="language-${this.escapeHtml(lang)}"` : '';

      // Drop blank lines at the edges but keep the indentation of the first code line.
      const content = body.replace(/^(?:[ \t]*\n)+/, '').replace(/\s+$/, '');

      const block = `<pre><code${cls}>${this.escapeHtml(content)}</code></pre>`;
      // Put the token on its own line so the paragraph pass treats it as a block.
      return `\n${this.stashHtml(stash, block, 'B')}\n`;
    });
  }

  /** Convert `# …` to `###### …` lines into `<h1>`–`<h6>`. */
  private parseHeaders(html: string): string {
    return html.replace(/^(#{1,6})[ \t]+(.+)$/gm, (_match: string, hashes: string, text: string) => {
      const level = hashes.length;
      return `<h${level}>${text.trim()}</h${level}>`;
    });
  }

  /**
   * Bold (`**x**`, `__x__`), italic (`*x*`, `_x_`) and bold-italic (`***x***`).
   * A delimiter only opens when followed by non-space and only closes when
   * preceded by non-space, so list bullets ("* item") are left alone.
   * Underscore forms are ignored inside words (snake_case stays intact).
   */
  private parseEmphasis(html: string): string {
    html = html.replace(/\*\*\*(?=\S)(.*?\S)\*\*\*/g, '<strong><em>$1</em></strong>');
    html = html.replace(/\*\*(?=\S)(.*?\S)\*\*/g, '<strong>$1</strong>');
    html = html.replace(/(^|\W)__(?=\S)(.*?\S)__(?!\w)/gm, '$1<strong>$2</strong>');

    html = html.replace(/\*(?=[^\s*])(.*?[^\s*])\*/g, '<em>$1</em>');
    html = html.replace(/(^|\W)_(?=\S)(.*?\S)_(?!\w)/gm, '$1<em>$2</em>');

    return html;
  }

  /**
   * Convert `![alt](src)` and `[text](url)`.
   * Attribute values are HTML-escaped and the tags are stashed so later passes
   * cannot touch the URL. With `sanitize` on, a URL whose scheme is not allowed
   * is dropped and only the alt text / link text is kept.
   */
  private parseLinksAndImages(html: string, stash: string[]): string {
    const linkTarget = this.options.linkTarget
      ? ` target="${this.escapeHtml(this.options.linkTarget)}" rel="noopener noreferrer"`
      : '';

    // Images: ![alt](src)
    html = html.replace(/!\[([^\]\u0000]*)\]\(([^)\u0000]*)\)/g, (_match: string, alt: string, src: string) => {
      const url = src.trim();
      if (this.options.sanitize && !this.isSafeUrl(url, true)) {
        return alt;
      }
      const tag = `<img src="${this.escapeHtml(url)}" alt="${this.escapeHtml(alt)}" style="max-width: 100%; height: auto; border-radius: 0.25rem; margin: 0.5rem 0;" />`;
      return this.stashHtml(stash, tag, 'I');
    });

    // Links: [text](url) — only the opening tag is stashed so the text still gets inline formatting.
    html = html.replace(/\[([^\]]*)\]\(([^)\u0000]*)\)/g, (_match: string, text: string, href: string) => {
      const url = href.trim();
      if (this.options.sanitize && !this.isSafeUrl(url, false)) {
        return text;
      }
      const open = `<a href="${this.escapeHtml(url)}"${linkTarget}>`;
      return `${this.stashHtml(stash, open, 'I')}${text}</a>`;
    });

    return html;
  }

  /**
   * Allow relative URLs and the http, https, mailto and tel schemes; optionally
   * base64 raster `data:image/...` URLs (used for images only).
   * Whitespace and control characters are ignored when reading the scheme,
   * matching how browsers parse URLs.
   */
  private isSafeUrl(url: string, allowImageData: boolean): boolean {
    const compact = url.replace(/[\u0000-\u0020\u007f-\u009f]/g, '');
    const scheme = /^([a-zA-Z][a-zA-Z0-9+.-]*):/.exec(compact);
    if (!scheme) {
      return true;
    }
    const name = scheme[1].toLowerCase();
    if (name === 'http' || name === 'https' || name === 'mailto' || name === 'tel') {
      return true;
    }
    return allowImageData && /^data:image\/(?:png|gif|jpe?g|webp);base64,/i.test(compact);
  }

  /** Convert `code` spans to escaped `<code>` and stash them; a span never crosses a blank line. */
  private parseInlineCode(html: string, stash: string[]): string {
    return html.replace(/`((?:[^`\n\u0000]|\n(?![ \t]*\n))+)`/g, (_match: string, code: string) =>
      this.stashHtml(stash, `<code>${this.escapeHtml(code)}</code>`, 'I'));
  }

  /**
   * Group consecutive bullet lines into one `<ul>` and consecutive numbered
   * lines into one `<ol>`. Each list is emitted on a single line; nesting is
   * not supported (indented items are flattened).
   */
  private parseLists(html: string): string {
    const output: string[] = [];
    let items: string[] = [];
    let openTag = '';

    const closeList = (): void => {
      if (items.length > 0) {
        output.push(`<${openTag}>${items.join('')}</${openTag}>`);
        items = [];
      }
      openTag = '';
    };

    for (const line of html.split('\n')) {
      const bullet = /^[ \t]*[-*+][ \t]+(.+)$/.exec(line);
      const numbered = bullet ? null : /^[ \t]*\d+\.[ \t]+(.+)$/.exec(line);
      const item = bullet || numbered;

      if (!item) {
        closeList();
        output.push(line);
        continue;
      }

      const tag = bullet ? 'ul' : 'ol';
      if (tag !== openTag) {
        closeList();
        openTag = tag;
      }
      items.push(`<li>${item[1].trim()}</li>`);
    }
    closeList();

    return output.join('\n');
  }

  /** Convert `> text` lines to `<blockquote>`; directly adjacent quote lines are merged into one element. */
  private parseBlockquotes(html: string): string {
    html = html.replace(/^>[ \t]+(.+)$/gm, '<blockquote>$1</blockquote>');

    // Merge only quotes on consecutive lines; a blank line starts a new quote.
    html = html.replace(/<\/blockquote>\n<blockquote>/g, ' ');

    return html;
  }

  /** Convert lines consisting of three or more `-`, `*` or `_` into `<hr>`. */
  private parseHorizontalRules(html: string): string {
    return html.replace(/^[ \t]*(?:-{3,}|\*{3,}|_{3,})[ \t]*$/gm, '<hr>');
  }

  /**
   * Build paragraphs: block-level lines stand alone, runs of other lines become
   * one `<p>` joined by `<br>`, blank lines separate paragraphs. A raw HTML
   * block opened on one line and closed on a later one is passed through
   * verbatim. Returns the input unchanged when the `breaks` option is false.
   */
  private parseLineBreaks(html: string): string {
    if (!this.options.breaks) {
      return html;
    }

    const blocks: string[] = [];
    let paragraph: string[] = [];
    let rawLines: string[] = [];
    let rawClose = ''; // closing tag prefix we are waiting for inside a multi-line raw block

    const flushParagraph = (): void => {
      if (paragraph.length > 0) {
        blocks.push(`<p>${paragraph.join('<br>')}</p>`);
        paragraph = [];
      }
    };

    for (const rawLine of html.split('\n')) {
      const line = rawLine.trim();

      if (rawClose) {
        rawLines.push(rawLine);
        if (line.toLowerCase().includes(rawClose)) {
          blocks.push(rawLines.join('\n'));
          rawLines = [];
          rawClose = '';
        }
        continue;
      }

      if (line === '') {
        flushParagraph();
        continue;
      }

      const opener = /^<(pre|div|p|ul|ol|blockquote)\b[^>]*>/i.exec(line);
      const closing = opener ? `</${opener[1].toLowerCase()}` : '';

      if (closing && !line.toLowerCase().includes(closing)) {
        flushParagraph();
        rawLines = [rawLine];
        rawClose = closing;
      } else if (this.isBlockElement(line)) {
        flushParagraph();
        blocks.push(line);
      } else {
        paragraph.push(line);
      }
    }

    // An unclosed raw block is still emitted rather than lost.
    if (rawLines.length > 0) {
      blocks.push(rawLines.join('\n'));
    }
    flushParagraph();

    return blocks.join('\n\n');
  }

  /** True when the line starts with a block-level tag (opening or closing) or with a stashed block token. */
  private isBlockElement(html: string): boolean {
    return /^<\/?(?:h[1-6]|p|div|ul|ol|li|blockquote|pre|hr)\b/i.test(html) || html.startsWith('\u0000B');
  }

  /**
   * Allow-list sanitizer for the not-yet-restored HTML.
   *
   * - `<script>` / `<style>` elements and HTML comments are removed with their content.
   * - Tags in ALLOWED_TAGS are re-emitted in lower case WITHOUT any attributes,
   *   so event handlers and hostile URLs cannot survive in raw HTML.
   * - Every other `<` is escaped, so unknown tags show up as literal text.
   *
   * Links and images written in markdown syntax keep their attributes because
   * they are stashed (already escaped and URL-checked) before this runs.
   * For production use a proper library like DOMPurify.
   */
  private sanitizeHtml(html: string): string {
    const stripped = html
      .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, '')
      .replace(/<!--[\s\S]*?-->/g, '');

    // Single authoritative pass: each "<" is either a rebuilt allowed tag or becomes "&lt;".
    return stripped.replace(
      /<(\/?)([a-zA-Z][\w:-]*)(?:\s[^<>]*|\/)?>|</g,
      (match: string, slash: string | undefined, name: string | undefined) => {
        const tag = name ? name.toLowerCase() : '';
        if (!SimpleMarkdownParser.ALLOWED_TAGS.has(tag)) {
          return `&lt;${match.slice(1)}`;
        }
        return `<${slash || ''}${tag}>`;
      }
    );
  }

  /** Escape text for use in HTML content or a quoted attribute value (no DOM required). */
  private escapeHtml(text: string): string {
    return text
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  }

  /**
   * Extract plain text from markdown (for word/character counting).
   * Code blocks, inline code and images are dropped entirely; link text is kept.
   *
   * @returns The text with markdown markers removed and whitespace collapsed.
   */
  extractText(markdown: string): string {
    if (!markdown) {
      return '';
    }

    let text = markdown.replace(/\r\n?/g, '\n');

    // Remove code blocks
    text = text.replace(/```[\s\S]*?```/g, '');

    // Remove inline code
    text = text.replace(/`[^`]*`/g, '');

    // Remove images
    text = text.replace(/!\[[^\]]*\]\([^)]*\)/g, '');

    // Remove links but keep text
    text = text.replace(/\[([^\]]*)\]\([^)]*\)/g, '$1');

    // Remove headers
    text = text.replace(/^#{1,6}\s+/gm, '');

    // Remove horizontal rules (before emphasis, otherwise "***" leaves a stray "*")
    text = text.replace(/^[ \t]*(?:-{3,}|\*{3,}|_{3,})[ \t]*$/gm, '');

    // Remove blockquotes
    text = text.replace(/^>\s+/gm, '');

    // Remove list markers (before emphasis, otherwise a "*" bullet pairs with an emphasis marker)
    text = text.replace(/^[\s]*[-*+]\s+/gm, '');
    text = text.replace(/^[\s]*\d+\.\s+/gm, '');

    // Remove emphasis markers, using the same rules as parse()
    text = text.replace(/\*\*\*(?=\S)(.*?\S)\*\*\*/g, '$1');
    text = text.replace(/\*\*(?=\S)(.*?\S)\*\*/g, '$1');
    text = text.replace(/(^|\W)__(?=\S)(.*?\S)__(?!\w)/gm, '$1$2');
    text = text.replace(/\*(?=[^\s*])(.*?[^\s*])\*/g, '$1');
    text = text.replace(/(^|\W)_(?=\S)(.*?\S)_(?!\w)/gm, '$1$2');

    // Clean up whitespace
    return text.replace(/\s+/g, ' ').trim();
  }

  /**
   * Count words in markdown text
   */
  countWords(markdown: string): number {
    const plainText = this.extractText(markdown);
    if (!plainText.trim()) return 0;
    return plainText.trim().split(/\s+/).length;
  }

  /**
   * Count characters in markdown text
   */
  countCharacters(markdown: string): number {
    return this.extractText(markdown).length;
  }

  /**
   * Generate table of contents from headers.
   * Lines inside ``` fenced code blocks are ignored and CRLF input is accepted.
   *
   * @returns One entry per ATX header, in document order. `anchor` is the
   *   lower-cased text reduced to `a-z`, `0-9` and single hyphens; duplicate
   *   headers produce duplicate anchors.
   */
  generateTOC(markdown: string): Array<{level: number, text: string, anchor: string}> {
    const headers: Array<{level: number, text: string, anchor: string}> = [];
    if (!markdown) {
      return headers;
    }

    const source = markdown.replace(/\r\n?/g, '\n').replace(/```[\s\S]*?```/g, '');

    for (const line of source.split('\n')) {
      const headerMatch = /^(#{1,6})\s+(.+)$/.exec(line);
      if (!headerMatch) {
        continue;
      }

      const level = headerMatch[1].length;
      const text = headerMatch[2].trim();
      const anchor = text.toLowerCase()
        .replace(/[^a-z0-9\s-]/g, '')
        .replace(/\s+/g, '-')
        .replace(/-+/g, '-')
        .replace(/^-|-$/g, '');

      headers.push({ level, text, anchor });
    }

    return headers;
  }
}

// Convenience functions for global use
/**
 * One-shot helper: builds a throwaway SimpleMarkdownParser from `options`
 * and returns the HTML it produces for `markdown`.
 */
export function parseMarkdown(markdown: string, options?: MarkdownParseOptions): string {
  return new SimpleMarkdownParser(options).parse(markdown);
}

/**
 * One-shot helper: returns the plain text of `markdown` as computed by
 * SimpleMarkdownParser.extractText on a default-configured parser.
 */
export function extractTextFromMarkdown(markdown: string): string {
  return new SimpleMarkdownParser().extractText(markdown);
}

/**
 * One-shot helper: returns the word count of `markdown` as computed by
 * SimpleMarkdownParser.countWords on a default-configured parser.
 */
export function countWordsInMarkdown(markdown: string): number {
  return new SimpleMarkdownParser().countWords(markdown);
}

/**
 * One-shot helper: returns the character count of `markdown` as computed by
 * SimpleMarkdownParser.countCharacters on a default-configured parser.
 */
export function countCharactersInMarkdown(markdown: string): number {
  return new SimpleMarkdownParser().countCharacters(markdown);
}

/**
 * One-shot helper: returns the table-of-contents entries for `markdown` as
 * computed by SimpleMarkdownParser.generateTOC on a default-configured parser.
 */
export function generateMarkdownTOC(markdown: string): Array<{level: number, text: string, anchor: string}> {
  return new SimpleMarkdownParser().generateTOC(markdown);
}