import { getDocument } from 'pdfjs-dist';
import 'pdfjs-dist/build/pdf.worker.mjs'; // Ensure worker is bundled
import { TextItem } from 'pdfjs-dist/types/src/display/api';
import { Transaction } from '../Transaction';

/**
 * Normalizes whitespace in a string: every run of whitespace becomes a single
 * space and leading/trailing whitespace is removed.
 */
const clean = (str: string): string => {
  // Splitting on whitespace runs leaves empty pieces at the edges; drop them
  // and re-join the remaining words with exactly one space.
  const words = str.split(/\s+/).filter((word) => word.length > 0);
  return words.join(' ');
};

/**
 * Parses the text tokens of a single PDF page into transactions.
 *
 * Tokens are grouped into rows on `hasEOL`. The row whose first token reads
 * `Date (UTC)` supplies the column x-positions, and every row from the first
 * one that starts with a month abbreviation onwards is bucketed into those
 * columns. Rows whose amount column does not parse as a number are skipped.
 *
 * @param tokens Text items of one page, in the order they were extracted.
 * @returns The parsed transactions. Empty when no header row is present or
 *   the header has too few columns to contain the amount column.
 */
const parseTransactionsFromRows = (tokens: TextItem[]): Transaction[] => {
  // Column indices (by ascending header x-position) read by this parser.
  // NOTE(review): the indices depend on how the header row is tokenized —
  // confirm against a real statement before changing them.
  const DATE_COL = 0;
  const DESCRIPTION_COL = 2;
  const DESCRIPTION_EXTRA_COL = 3;
  const SUFFIX_COL = 6;
  const AMOUNT_COL = 7;

  // Group tokens into rows; a token flagged with hasEOL closes the current row.
  const rowTokens: TextItem[][] = [];
  let currentRow: TextItem[] = [];
  for (const token of tokens) {
    currentRow.push(token);
    if (token.hasEOL) {
      rowTokens.push(currentRow);
      currentRow = [];
    }
  }
  // The final row may not be terminated by hasEOL; keep it instead of dropping it.
  if (currentRow.length > 0) {
    rowTokens.push(currentRow);
  }

  // Locate the header row: its first token is "Date (UTC)" modulo whitespace.
  const headerRow = rowTokens.find((row) => row[0]?.str.replace(/\s+/g, '') === 'Date(UTC)');
  if (!headerRow) return [];

  // Use the header row to determine column boundaries: the midpoint between
  // two neighbouring header x-positions separates their columns.
  const headerX = headerRow.map((token) => token.transform[4]).sort((a, b) => a - b);
  const numCols = headerX.length;
  // Without an amount column there is nothing to parse on this page.
  if (numCols <= AMOUNT_COL) return [];

  const boundaries: number[] = [];
  for (let i = 0; i < numCols - 1; i++) {
    boundaries.push((headerX[i] + headerX[i + 1]) / 2);
  }
  // Everything to the right of the last midpoint belongs to the last column.
  boundaries.push(Infinity);

  // Skip everything before the first row that starts with a month abbreviation.
  const monthRegex = /^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\b/;
  const firstDataRow = rowTokens.findIndex((row) => monthRegex.test((row[0]?.str ?? '').trim()));
  const dataRows = firstDataRow === -1 ? [] : rowTokens.slice(firstDataRow);

  const transactions: Transaction[] = [];
  let lastDate = '';
  for (const row of dataRows) {
    // Bucket the row's tokens, left to right, into the header-derived columns.
    const columns: string[] = new Array(numCols).fill('');
    const sorted = row.slice().sort((a, b) => a.transform[4] - b.transform[4]);
    for (const token of sorted) {
      const x = token.transform[4];
      let colIndex = 0;
      while (colIndex < boundaries.length && x > boundaries[colIndex]) {
        colIndex++;
      }
      columns[colIndex] += token.str + ' ';
    }
    const cell = (index: number): string => clean(columns[index] ?? '');

    // A row without its own date inherits the date of the previous row.
    const dateVal = cell(DATE_COL) || lastDate;
    lastDate = dateVal;

    // Strip currency symbols, thousands separators and ALL whitespace (tokens
    // are joined with spaces, so a sign and its digits may be separated), then
    // map Unicode dashes/minus signs to an ASCII '-' so parseFloat sees a sign.
    const amountStr = (columns[AMOUNT_COL] ?? '')
      .replace(/[$,\s]/g, '')
      .replace(/[\u2014\u2013\u2012\u2212]/g, '-');
    const amountVal = parseFloat(amountStr);
    // Rows without a numeric amount are not transactions.
    if (Number.isNaN(amountVal)) continue;

    const extraDescription = cell(DESCRIPTION_EXTRA_COL);
    transactions.push({
      date: dateVal,
      description:
        cell(DESCRIPTION_COL) +
        (extraDescription ? ' ' + extraDescription : '') +
        ' | ' +
        cell(SUFFIX_COL),
      // Store the magnitude only; the sign is carried by `direction`.
      amount: Math.abs(amountVal),
      direction: amountVal > 0 ? 'CREDIT' : 'DEBIT',
    } as Transaction);
  }
  return transactions;
};
/**
 * Returns copies of the given transactions with `", <year>"` appended to each
 * `date`. The input objects are not modified.
 */
const transform = (transactions: Transaction[], year: number) => {
  const yearSuffix = ', ' + year;
  return transactions.map((entry) => ({ ...entry, date: entry.date + yearSuffix }));
};
/**
 * Extracts transactions from one or more PDF files.
 *
 * Each file is read into memory, opened with pdf.js, and parsed page by page
 * with `parseTransactionsFromRows`. Files are processed sequentially, so the
 * result keeps file order and, within a file, page order.
 *
 * @param files PDF files to parse.
 * @param year Year appended to every transaction date via `transform`.
 * @returns All parsed transactions with the year appended to their dates.
 * @throws Rejects if a file cannot be read or a PDF cannot be loaded/parsed.
 */
const processMercuryFile = async (files: File[], year: number): Promise<Transaction[]> => {
  // Wraps the callback-based FileReader. Only the read itself lives inside the
  // Promise executor so that later async failures cannot leave it unsettled.
  const readFileBytes = (file: File): Promise<Uint8Array> =>
    new Promise<Uint8Array>((resolve, reject) => {
      const reader = new FileReader();
      reader.onload = () => resolve(new Uint8Array(reader.result as ArrayBuffer));
      // Reject with the underlying error rather than the ProgressEvent.
      reader.onerror = () =>
        reject(reader.error ?? new Error(`Failed to read file: ${file.name}`));
      reader.readAsArrayBuffer(file);
    });

  // Parses every page of one PDF file into transactions.
  const extractTransactions = async (file: File): Promise<Transaction[]> => {
    const data = await readFileBytes(file);
    const pdf = await getDocument({ data }).promise;
    try {
      const fileTransactions: Transaction[] = [];
      // pdf.js page numbers are 1-based.
      for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
        const page = await pdf.getPage(pageNumber);
        const textContent = await page.getTextContent();
        // Pass the token array directly.
        const items = textContent.items as TextItem[];
        fileTransactions.push(...parseTransactionsFromRows(items));
      }
      return fileTransactions;
    } finally {
      // Release the document's resources whether or not parsing succeeded.
      await pdf.destroy();
    }
  };

  const allTransactions: Transaction[] = [];
  for (const file of files) {
    allTransactions.push(...(await extractTransactions(file)));
  }

  return transform(allTransactions, year);
};

export default processMercuryFile;
