import { Entry, RawEntry, TextMatchItem } from "../types";

// Lower-case English stop words. filterStopWords consults this set to drop such
// words from a word list; lookups are exact (case-sensitive) string matches.
const StopWordSet = new Set<string>([
    // personal, possessive and reflexive pronouns
    'i', 'me', 'my', 'myself',
    'we', 'our', 'ours', 'ourselves',
    'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself',
    'she', 'her', 'hers', 'herself',
    'it', 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves',
    // interrogatives and demonstratives
    'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
    // forms of "be", "have" and "do"
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing',
    // articles and conjunctions
    'a', 'an', 'the',
    'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    // prepositions and directional particles
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from',
    'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
    // adverbs, quantifiers and negations
    'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    // fragments and fillers ('s', 't', 'don' are presumably left over from split contractions)
    's', 't', 'can', 'will', 'just', 'don', 'should', 'now',
]);

/**
 * Splits a semicolon-separated string into its individual words.
 *
 * Every piece is trimmed, and pieces that are empty after trimming are dropped.
 *
 * @param str - Raw string holding words separated by `;`.
 * @returns The trimmed, non-empty words in their original order; an empty array
 *          when `str` is empty or contains only whitespace.
 */
export const getWordsFromStr = (str: string): string[] => {
    const trimmedInput = str.trim();
    if (trimmedInput.length === 0) {
        return [];
    }

    // TODO: consider case form
    const words: string[] = [];
    for (const piece of trimmedInput.split(";")) {
        const word = piece.trim();
        if (word.length > 0) {
            words.push(word);
        }
    }
    return words;
};

/**
 * Trims every word and drops the ones that are empty after trimming.
 *
 * @param words - Words that may carry surrounding whitespace or be blank.
 * @returns A new array with the trimmed, non-empty words in their original order.
 */
export const sanitizeWords = (words: string[]): string[] =>
    words.reduce<string[]>((kept, rawWord) => {
        const word = rawWord.trim();
        if (word) {
            kept.push(word);
        }
        return kept;
    }, []);

// TODO: All target words for one source word are currently aggregated together, so the
// specialties attached to them often cover most of the known ones.
// Grouping target words by specialty needs a change in the BE.
// Most target word entries have specialties covering all, so no industry needs to be
// specified currently.
/**
 * Collects the distinct target words of all given entries.
 *
 * Each entry's `target` list is cleaned with `sanitizeWords`, the results are
 * de-duplicated (exact string comparison) and then passed through `filterTgtWords`.
 *
 * @param targets - Entries whose `target` words should be gathered.
 * @returns The de-duplicated target words, in first-seen order, after filtering.
 */
export const getTgtWords = (targets: RawEntry[]) => {
    const uniqueWords = new Set<string>();

    targets.forEach((entry) => {
        // TODO: consider case form
        for (const word of sanitizeWords(entry.target)) {
            uniqueWords.add(word);
        }
    });

    return filterTgtWords(Array.from(uniqueWords));
};

/**
 * Heuristic check for whether a character code belongs to a Chinese character.
 *
 * Every code strictly above 10000 is reported as Chinese.
 * NOTE(review): this threshold is coarse. It also accepts non-Chinese characters whose
 * codes lie above 10000 (e.g. Japanese kana, Hangul, various symbols); the CJK Unified
 * Ideographs block itself only starts at U+4E00 (19968).
 *
 * @param charCode - Character code to test, e.g. the result of `String.prototype.charCodeAt`.
 * @returns `true` when `charCode` is greater than 10000, otherwise `false`.
 */
export const checkIsChineseChar = (charCode: number) => {
    const chineseCodeThreshold = 10000;
    return charCode > chineseCodeThreshold;
};

/**
 * Computes how much of a text is covered by matches, weighted by each match's score.
 *
 * Every part contributes its text length to the total. A matched part additionally
 * contributes `matchScore / 5` of its length to the matched amount (a missing
 * `matchScore` counts as 0). The result is matched amount divided by total length.
 *
 * NOTE(review): dividing by 5 presumably assumes `matchScore` is on a 0–5 scale, so that
 * a secondary match scored 3 weighs 60% of a primary match scored 5 — confirm against
 * the code that assigns `matchScore`.
 *
 * @param matchParts - Consecutive parts of a text, each flagged as matched or not.
 * @returns The weighted matched fraction of the text; 0 when there are no parts or all
 *          parts have empty text (instead of the `NaN` that `0 / 0` would produce).
 */
export const getMatchScore = (matchParts: TextMatchItem[]) => {
    let totalLength = 0;
    let weightedMatchLength = 0;

    for (const part of matchParts) {
        const partLength = part.text.length;
        totalLength += partLength;
        if (part.isMatched) {
            // counting secondary match as 60% weight of the primary
            weightedMatchLength += (part.matchScore ?? 0) / 5 * partLength;
        }
    }

    // Nothing to score: avoid 0 / 0 = NaN, which would poison comparisons and sorting.
    if (totalLength === 0) {
        return 0;
    }

    // TODO: refer to document model to calculate this score correctly,
    // e.g. give weight to longer matches: score *= Math.log2(text.length);
    return weightedMatchLength / totalLength;
};

// TODO: Improve the function when necessary
/**
 * Gets the highest score of all target phrases.
 *
 * Items without a `matchScore` (or missing items) count as 0, and the result never
 * drops below the starting value of 0.
 *
 * @param textMatchItems - Match items whose `matchScore` values are compared.
 * @returns The largest `matchScore` found, or 0 when the list is empty or no score
 *          exceeds 0.
 */
export const getSyntheticMatchScore = (textMatchItems: TextMatchItem[]) => {
    let highestScore = 0;

    for (const item of textMatchItems) {
        const itemScore = item?.matchScore ?? 0;
        if (itemScore > highestScore) {
            highestScore = itemScore;
        }
    }

    return highestScore;
};

/**
 * Trims target words and removes the ones too short to be meaningful.
 *
 * A word is kept when it is longer than one character, or when it is a single character
 * that `checkIsChineseChar` accepts. Empty words and single letters or single special
 * characters are therefore dropped.
 *
 * @param tgtWords - Candidate target words.
 * @returns A new array with the trimmed words that passed the filter, in original order.
 */
export const filterTgtWords = (tgtWords: string[]) => {
    const keptWords: string[] = [];

    tgtWords.forEach((rawWord) => {
        const word = rawWord.trim();
        // TODO: Need to also filter special Chinese characters
        const isSingleChineseChar = word.length === 1 && checkIsChineseChar(word.charCodeAt(0));
        if (word.length > 1 || isSingleChineseChar) {
            keptWords.push(word);
        }
    });

    return keptWords;
};

/**
 * Removes every word that is contained in `StopWordSet`.
 *
 * The lookup is an exact string match, so it is case-sensitive: the set holds lower-case
 * entries, and a word such as `The` is not removed unless the caller lower-cases it first.
 *
 * @param words - Words to filter.
 * @returns A new array with the words that are not stop words, in original order.
 */
export const filterStopWords = (words: string[]) => {
    const remaining: string[] = [];

    words.forEach((word) => {
        if (!StopWordSet.has(word)) {
            remaining.push(word);
        }
    });

    return remaining;
};

/**
 * Breaks target words/phrases into distinct lower-cased unit words without stop words.
 *
 * Each entry is split on the single space character `' '` only (other whitespace such as
 * tabs is not treated as a separator); empty pieces are skipped, the rest are lower-cased
 * and de-duplicated, and the result is passed through `filterStopWords`.
 *
 * @param tgtWords - Target words or multi-word phrases.
 * @returns The distinct lower-cased unit words, in first-seen order, minus stop words.
 */
export const getUnitWords = (tgtWords: string[]) => {
    const lowerCasedUnits = new Set<string>();

    tgtWords.forEach((phrase) => {
        for (const unit of phrase.split(' ')) {
            // consecutive spaces produce empty pieces; skip them
            if (unit) {
                lowerCasedUnits.add(unit.toLowerCase());
            }
        }
    });

    return filterStopWords(Array.from(lowerCasedUnits));
};

/**
 * Builds the source of a regular expression that matches any of the given keywords.
 *
 * Each keyword is trimmed, regex special characters are escaped so the keyword is matched
 * literally (e.g. `(test)` => `\(test\)`), and the keywords are joined with `|`.
 * Keywords that are blank after trimming are skipped: an empty alternative (as in `foo|`)
 * would match the empty string at every position of the searched text.
 *
 * NOTE(review): `\b` in JavaScript is defined in terms of ASCII word characters, so the
 * full-word wrapping does not behave as a word boundary for keywords that start or end
 * with non-word characters (e.g. CJK text or punctuation).
 *
 * @param keywords - Keywords to match literally.
 * @param isMatchFullWord - When truthy, every keyword is wrapped in `\b...\b`.
 * @returns The alternation pattern source, in the order of `keywords`; an empty string
 *          when no non-blank keyword is given (callers should not compile that, since an
 *          empty pattern matches everywhere).
 */
export function getKeywordMatchPattern(keywords: string[], isMatchFullWord?: boolean) {
    return keywords
        .map(keyword => keyword.trim())
        // drop blank keywords so the pattern never contains an empty alternative
        .filter(keyword => keyword.length > 0)
        // escape special regex characters so they are treated as literals, not as syntax
        .map(keyword => keyword.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
        // the word boundary is added after escaping so its backslashes are not escaped themselves
        .map(escaped => (isMatchFullWord ? `\\b${escaped}\\b` : escaped))
        .join('|');
}

/**
 * Builds the primary match pattern: the target words themselves, matched literally
 * and without full-word boundaries.
 *
 * @param tgtWords - Target words or phrases to match.
 * @returns The pattern source produced by `getKeywordMatchPattern(tgtWords, false)`.
 */
export const getPrimaryKeywordMatchPattern = (tgtWords: string[]) =>
    getKeywordMatchPattern(tgtWords, false);

/**
 * Builds the secondary match pattern: the unit words derived from the target words
 * via `getUnitWords`, each matched as a full word.
 *
 * @param tgtWords - Target words or phrases to break into unit words.
 * @returns The pattern source produced by `getKeywordMatchPattern(unitWords, true)`.
 */
export const getSecondaryKeywordMatchPattern = (tgtWords: string[]) =>
    getKeywordMatchPattern(getUnitWords(tgtWords), true);