/**
 * Number of words per n-gram used when matching a query against paragraphs.
 * Trigrams for now; may need to be reduced if the matching proves too strict.
 */
export const NGRAM_LENGTH = 3;
/**
 * A paragraph of text together with the n-grams used to match it against a
 * search query.
 */
export interface IParagraph {
  /** The paragraph's text; short queries are matched against its prefix. */
  text: string;
  /** NOTE(review): presumably the paragraph's position in its source document — confirm. */
  index: number;
  /**
   * N-grams compared against the query's n-grams when ranking.
   * NOTE(review): presumably precomputed from `text` with the same n-gram
   * length and lowercase normalisation as the query — confirm against the
   * code that builds paragraphs.
   */
  nGrams: string[];
  /** NOTE(review): presumably the heading of the section containing the paragraph — confirm. */
  sectionHeading?: string;
}

/** A paragraph paired with its similarity score for a particular query. */
export interface RankedParagraph {
  /** The paragraph that was scored. */
  paragraph: IParagraph;
  /** Similarity between the query and the paragraph; higher means a closer match. */
  score: number;
}
/**
 * Splits `text` into lowercase, whitespace-delimited words and returns every
 * run of `n` consecutive words, each joined by a single space.
 *
 * Leading, trailing and repeated whitespace is ignored, so no n-gram ever
 * contains an empty "word".
 *
 * @param text - Text to tokenize.
 * @param n - Number of words per n-gram; must be a positive integer.
 * @returns The n-grams in order of appearance (duplicates are kept). Empty
 *   when `text` has fewer than `n` words or `n` is not a positive integer.
 */
export function generateNGrams(text: string, n: number): string[] {
  if (!Number.isInteger(n) || n < 1) {
    return [];
  }

  // split(/\s+/) yields '' for leading/trailing whitespace and for empty
  // input; drop those so they cannot leak into the n-grams.
  const words = text
    .toLowerCase()
    .split(/\s+/)
    .filter(word => word.length > 0);

  const ngrams: string[] = [];
  for (let start = 0; start + n <= words.length; start++) {
    ngrams.push(words.slice(start, start + n).join(' '));
  }

  return ngrams;
}

/**
 * Scores each paragraph by the Jaccard similarity between its n-grams and the
 * n-grams of `query`, and returns the scored paragraphs best match first.
 *
 * Both sides are compared as sets of distinct n-grams, so the score is always
 * within [0, 1] and repeated n-grams do not distort it. Paragraphs with no
 * n-grams are omitted from the result.
 *
 * @param query - Search text; it is tokenized into n-grams with `generateNGrams`.
 * @param paragraphs - Candidate paragraphs; the array is not modified.
 * @param n - Number of words per n-gram used for the query.
 * @returns A new array of `{ paragraph, score }` sorted by descending score.
 */
export function searchAndRankParagraphs(
  query: string,
  paragraphs: IParagraph[],
  n: number
): RankedParagraph[] {
  // Jaccard similarity is defined on sets; collapsing duplicates also gives
  // constant-time membership checks below.
  const queryNGrams = new Set(generateNGrams(query, n));

  const rankedParagraphs: RankedParagraph[] = [];
  for (const paragraph of paragraphs) {
    if (paragraph.nGrams.length === 0) {
      continue;
    }

    const paragraphNGrams = new Set(paragraph.nGrams);

    let intersectionSize = 0;
    for (const ngram of queryNGrams) {
      if (paragraphNGrams.has(ngram)) {
        intersectionSize++;
      }
    }

    // paragraphNGrams is non-empty here, so unionSize is at least 1 and the
    // division is safe.
    const unionSize =
      queryNGrams.size + paragraphNGrams.size - intersectionSize;

    rankedParagraphs.push({
      paragraph,
      score: intersectionSize / unionSize,
    });
  }

  // Highest similarity first.
  rankedParagraphs.sort((a, b) => b.score - a.score);

  return rankedParagraphs;
}

/**
 * Finds the paragraph that best matches `paragraphQuery`.
 *
 * - A query with no words (empty or whitespace only) matches nothing.
 * - A query with fewer than `NGRAM_LENGTH` words cannot form an n-gram, so
 *   the first paragraph whose text starts with the query (case-sensitive,
 *   query used verbatim) is returned.
 * - Otherwise paragraphs are ranked with `searchAndRankParagraphs` and the
 *   top-ranked one is returned, provided its score reaches the minimum
 *   threshold.
 *
 * @param paragraphQuery - Text to look for.
 * @param paragraphs - Candidate paragraphs.
 * @returns The best matching paragraph, or `undefined` when nothing matches
 *   well enough.
 */
export function getMatchingParagraph(
  paragraphQuery: string,
  paragraphs: IParagraph[]
): IParagraph | undefined {
  // Scores below this are treated as "no match".
  const minMatchScore = 0.09;

  // Count words on any whitespace run so that repeated spaces, tabs or
  // newlines do not inflate the count and wrongly skip the prefix fallback.
  const queryWordCount = paragraphQuery
    .split(/\s+/)
    .filter(word => word.length > 0).length;

  if (queryWordCount === 0) {
    // Every string starts with '', so without this guard an empty query
    // would "match" whichever paragraph happens to be first.
    return undefined;
  }

  if (queryWordCount < NGRAM_LENGTH) {
    // Too short for n-gram matching: fall back to the first prefix match.
    return paragraphs.find(paragraph =>
      paragraph.text.startsWith(paragraphQuery)
    );
  }

  const rankedParagraphs = searchAndRankParagraphs(
    paragraphQuery,
    paragraphs,
    NGRAM_LENGTH
  );

  const bestMatchedParagraph = rankedParagraphs[0];
  if (
    bestMatchedParagraph === undefined ||
    bestMatchedParagraph.score < minMatchScore
  ) {
    return undefined;
  }

  return bestMatchedParagraph.paragraph;
}
