import { IArticleLang, IRefParts, ISentenceQuote } from '../interfaces/article/article';
import { IGroupedMentions, IMention } from '../interfaces/article/mentions';
import { LANG_KEY_PART, META_LOCATION, SENT_LOCATION } from '../interfaces/article/sentences';
import { getClLogger } from './clLogger';
import { isDuplicatedViaLevenshtein, jaroDistance, jaroWinklerDistance, levenshteinSimilarityPercentage } from './stringUtil';

// Logger for this module, obtained by passing this file's name to getClLogger.
const clLogger = getClLogger(__filename);

// It is often useful to get and display the title of an article based on some piece
// of metadata extracted within it. This is the predictable key which will fetch the
// first sentence (usually there is only one, but not always) for any content we've ever parsed.
// It follows the lang_location_paragraph_sentence layout that getLang builds with its default delimiter.
export const ENGLISH_TITLE = 'en_t_000_000';

// Keep delimiters consistent
// Separates the article part of a key from the position (lang key) part that follows it.
export const ARTICLE_POSITION_DELIMITER = '/';
// Separates the parts of the position portion inside a full article key (split on by getLangFromKey).
export const ARTICLE_KEY_LANG_DELIMITER = '-';
// Separates the publisher from the article reference.
export const PUBLISHER_DELIMITER = '_';
// Separates the parts of a standalone lang key such as ENGLISH_TITLE.
export const LANG_KEY_DELIMITER = '_';
// Language used by getSentenceKey when no original language is supplied.
const DEFAULT_LANG = 'en';

// Placeholder texts shown when the real sentence or title text is missing.
export const SENTENCE_TEXT_UNAVAILABLE = 'Quote unavailable.';
const TITLE_TEXT_UNAVAILABLE = 'Title unavailable.';
// NOTE(review): not referenced in the visible part of this file - presumably used by importers; confirm.
export const TRANSLATION_SENTENCE_TEXT_UNAVAILABLE = 'Translation Unavailable';

// Reduce the long form of a metadata ID to its trailing offset, keeping the optional
// "-N" suffix that tells apart several metas starting at the same offset X (X-1, X-2, etc.).
// i.e. 'VOID-Body-1238213-2' becomes '1238213-2' and 'Trend-TrendP32-Body-591' becomes '591'.
// Returns undefined when the ID has no such trailing run, which is why the return type
// is string | undefined rather than plain string.
// NOTE(review): the pattern accepts only ONE digit after the hyphen, so an ID ending in
// '-591-12' yields '12' instead of '591-12' - confirm whether 10+ metas per offset can occur.
function origMetaIdToBeginIdx(metaId: string): string | undefined {
    const trailingOffset = /[0-9]+?-?[0-9]?$/.exec(metaId);
    // exec() yields null when nothing matches; on success index 0 holds the whole match
    return trailingOffset ? trailingOffset[0] : undefined;
}

// Turn a Prolog sentence property into a unique, hyphen-delimited sentence key:
//   <lang>-<location>-<paragraph>-<sentence>[-<meta offset>]
// The trailing meta offset (taken from metaId) makes it possible to highlight one precise meta
// inside that sentence. The number portion of metaId can be formatted in two ways:
//  Trend-TrendP32-Body-591
//  Trend-TrendP32-Body-591-1
// When sentence is empty or equals sentenceNA, paragraph and sentence are both written as 000.
export function getSentenceKey(metaId: string, sentence: string, origLang: string, sentenceNA: string): string {
    const metaOffset = origMetaIdToBeginIdx(metaId);
    // Some language keys include hyphens for locale, such as zh-hk or en-au. Only keep the language part.
    const language = origLang ? origLang.split('-')[0] : DEFAULT_LANG;
    const sentLocation = getSentLocation(metaId);

    // No usable sentence coordinates: fall back to the zeroed paragraph/sentence form
    if (!sentence || sentence === sentenceNA) {
        const offsetSuffix = metaOffset ? `-${metaOffset}` : '';
        return `${language}-${sentLocation}-000-000${offsetSuffix}`;
    }

    const coordinates = parseSentence(sentence);
    return getLang(language, sentLocation, coordinates.paragraph, coordinates.sentence, metaOffset, '-');
}

// Map the location marker embedded in a metadata ID to its short sentence location code.
// Body markers are checked first, then title markers; an ID containing neither is treated as grid.
export function getSentLocation(metaId: string): SENT_LOCATION {
    const containsAny = (markers: string[]): boolean => markers.some((marker) => metaId.includes(marker));

    if (containsAny([META_LOCATION.BODY, META_LOCATION.BODY_ALT])) {
        return SENT_LOCATION.BODY;
    }
    if (containsAny([META_LOCATION.TITLE, META_LOCATION.TITLE_ALT])) {
        return SENT_LOCATION.TITLE;
    }
    return SENT_LOCATION.GRID;
}

// Build a lang key from as many leading coordinates as are supplied, joined by delimiter:
//   lang + location + paragraph + sentence + absoluteIndex -> all five parts
//   lang + location + paragraph + sentence                 -> four parts
//   lang + location + paragraph                            -> three parts plus a trailing delimiter
//   lang + location                                        -> two parts
//   lang only                                              -> lang plus a trailing delimiter
// Parts are consumed left to right; the first missing one ends the key.
// paragraph and sentence are left-padded with zeros to three characters.
export function getLang(
    lang: string,
    location?: string,
    paragraph?: number | string,
    sentence?: number | string,
    absoluteIndex?: number | string,
    delimiter: string = '_',
) {
    // A string index may itself contain '-' or '_' (word-numInSentence combo); rewrite both
    // to the requested delimiter, one after the other
    let indexValue = absoluteIndex;
    if (typeof indexValue === 'string') {
        for (const knownDelimiter of [ARTICLE_KEY_LANG_DELIMITER, LANG_KEY_DELIMITER]) {
            indexValue = indexValue.replace(new RegExp(knownDelimiter, 'g'), delimiter);
        }
    }

    // null and undefined stay "absent"; any number, including 0, becomes a non-empty string
    const paragraphPart = paragraph == null ? null : paragraph.toString().padStart(3, '0');
    const sentencePart = sentence == null ? null : sentence.toString().padStart(3, '0');
    const indexPart = indexValue == null ? null : indexValue.toString();

    let langKey = `${lang}${delimiter}`;
    if (!location) {
        return langKey;
    }
    langKey = `${langKey}${location}`;
    if (!paragraphPart) {
        return langKey;
    }
    langKey = `${langKey}${delimiter}${paragraphPart}`;
    if (!sentencePart) {
        // Paragraph-level keys keep a trailing delimiter
        return `${langKey}${delimiter}`;
    }
    langKey = `${langKey}${delimiter}${sentencePart}`;
    return indexPart ? `${langKey}${delimiter}${indexPart}` : langKey;
}

// Read paragraph and sentence numbers out of a Prolog sentence reference.
// The reference is split on '-'; the first character of each of the first two parts is dropped
// and the remainder is read as a 1-based number, which is returned 0-based (values <= 0 become 0).
// e.g. 'p3-s2' gives { paragraph: 2, sentence: 1 }.
// Throws a TypeError when the reference has no second hyphen-separated part.
export function parseSentence(prologSentence: string): { paragraph: number, sentence: number } {
    const [paragraphToken, sentenceToken] = prologSentence.split('-');
    const toZeroBased = (token: string): number => {
        const oneBased = Number(token.slice(1));
        return oneBased <= 0 ? 0 : oneBased - 1;
    };
    const paragraph = toZeroBased(paragraphToken);
    const sentence = toZeroBased(sentenceToken);
    return { paragraph, sentence };
}

// Compose a correctly delimited full article reference: [publisher_]articleKey/langKey.
// Publisher is optional in case an article+lang only composition is needed.
export function getCompositeArticleId(articleKey: string, langKey: string, publisher?: string): string {
    const articleLang = `${articleKey}${ARTICLE_POSITION_DELIMITER}${langKey}`;
    return publisher ? `${publisher}${PUBLISHER_DELIMITER}${articleLang}` : articleLang;
}

// Return everything in key before the first ARTICLE_POSITION_DELIMITER.
// A key that does not contain the delimiter yields an empty string.
export function getArticleIdFromKey(key: string) {
    const delimiterAt = key.indexOf(ARTICLE_POSITION_DELIMITER);
    // substring treats a negative end as 0, so the "not found" case (-1) produces ''
    return key.substring(0, delimiterAt);
}

// Decompose a full article reference into publisher, article id and lang id in a single call,
// instead of extracting it part by part. The publisher delimiter can be overridden because
// references from different sources use different delimiters.
// ref is first split on publisherDelimiter as <publisher><delimiter><article+lang>. When the same
// delimiter also separates article and lang, the lang id is already the third piece of that split;
// otherwise it is taken from splitting <article+lang> on ARTICLE_POSITION_DELIMITER.
// Throws a TypeError when ref does not contain publisherDelimiter at all.
export function getRefParts(ref: string, publisherDelimiter: string = PUBLISHER_DELIMITER): IRefParts {
    const [publisher, articleLang, langFromPublisherSplit] = ref.split(publisherDelimiter);
    const [articleId, langFromArticleSplit] = articleLang.split(ARTICLE_POSITION_DELIMITER);
    return {
        publisher,
        articleId,
        // Prefer the piece produced by the publisher split; fall back to the article+lang split
        langId: langFromPublisherSplit || langFromArticleSplit,
    };
}

// From a full key holding an article reference and a position inside that article, extract the
// text coordinate part. Everything after the first ARTICLE_POSITION_DELIMITER is split on
// ARTICLE_KEY_LANG_DELIMITER, at most includeUpTo pieces are kept, and they are re-joined with
// LANG_KEY_DELIMITER. A key without ARTICLE_POSITION_DELIMITER is processed in full.
// Defaults to lang-location(body|title)-paragraph-sentence, but allows this to be overridden.
export function getLangFromKey(key: string, includeUpTo: LANG_KEY_PART = LANG_KEY_PART.SENTENCE) {
    const positionStart = key.indexOf(ARTICLE_POSITION_DELIMITER) + 1;
    const keptParts = key.slice(positionStart).split(ARTICLE_KEY_LANG_DELIMITER, includeUpTo);
    return keptParts.join(LANG_KEY_DELIMITER);
}

// Returns the items of an array whose key (as computed by the key function) has not been seen
// earlier in the array. The first item for each key wins and the original order is kept.
export function uniqBy<T>(a: T[], key: (t: T) => string) {
    const seenKeys = new Set();
    return a.filter((candidate) => {
        const identity = key(candidate);
        if (seenKeys.has(identity)) {
            return false;
        }
        seenKeys.add(identity);
        return true;
    });
}

// Replace the first two characters of a lang key with 'en', leaving the rest untouched.
export function changeLangToEnglish(lang: string) {
    const afterLangCode = lang.substring(2);
    return `en${afterLangCode}`;
}

// Dynamo rejects duplicates (per the original note). This function collapses references that share
// the same articleSentenceId + lang, keeping the LAST occurrence of each in first-seen order, and
// writes the duplicated keys to the debug log when showDuplicates is set.
export function getDedupedSentenceReferences(listToDedup: IArticleLang[], showDuplicates: boolean): IArticleLang[] {
    const latestByKey = new Map<string, IArticleLang>();
    const occurrences = new Map<string, number>();

    listToDedup.forEach((reference) => {
        const dedupKey = `${reference.articleSentenceId}/${reference.lang}`;
        occurrences.set(dedupKey, (occurrences.get(dedupKey) || 0) + 1);
        // A later duplicate overwrites the earlier value but keeps its position
        latestByKey.set(dedupKey, reference);
    });

    if (showDuplicates) {
        const duplicates = Array.from(occurrences.entries())
            .filter(([, count]) => count > 1)
            .map(([dedupKey]) => dedupKey);
        if (duplicates.length) {
            clLogger.debug(`Duplicates being sent to getSentences: ${duplicates}`);
        }
    }
    return Array.from(latestByKey.values());
}

// Reuse this to clean text from outside content: removes every Unicode replacement
// character (U+FFFD) and then trims surrounding whitespace.
function cleanSentenceText(sentenceText: string) {
    // Strip before trimming: trimming first would leave whitespace stranded wherever a
    // replacement character sat at the very start or end of the text
    return sentenceText.replace(/\uFFFD/g, '').trim();
}

// Keeps the title-related string literals in one place: a lang key refers to a title when it
// contains the 't' location segment, in either the '_' or the '-' delimited form.
function isTitleLang(lang: string) {
    const titleMarkers = ['_t_', '-t-'];
    return titleMarkers.some((marker) => lang.includes(marker));
}

// Build a map from (articleSentenceId + lang) to cleaned sentence text.
//  - noDuplicates: skip non-title sentences whose text isDuplicatedViaLevenshtein reports as a
//    duplicate of one already accepted; title sentences are never dropped.
//  - setUnavailableText: for every requested key that has no text and was not dropped as a
//    duplicate, store a placeholder: unavailableFn(lang) when supplied, otherwise the title or
//    sentence "unavailable" constant depending on the lang key.
export function getNormalizedSentenceMap(sentenceKeys: {articleSentenceId: string, lang: string}[],
                                         sentences: ISentenceQuote[],
                                         noDuplicates: boolean,
                                         setUnavailableText: boolean,
                                         unavailableFn?: (lang: string) => string) {
    const acceptedTexts = [];
    const droppedAsDuplicate = new Set<string>();
    const sentencesMap: { [ key: string ]: string } = {};

    // Add keys for all allowed sentences
    for (const quote of sentences) {
        const cleanedText = cleanSentenceText(quote.sentenceText);
        const mapKey = quote.articleSentenceId + quote.lang;
        // Only dedup non-title sentences, and only when asked to
        const isDroppedDuplicate = noDuplicates
            && !isTitleLang(quote.lang)
            && isDuplicatedViaLevenshtein(cleanedText, acceptedTexts);
        if (isDroppedDuplicate) {
            // Remember it so it is not later reported as unavailable
            droppedAsDuplicate.add(mapKey);
            continue;
        }
        acceptedTexts.push(cleanedText);
        sentencesMap[mapKey] = cleanedText;
    }

    if (!setUnavailableText) {
        return sentencesMap;
    }

    // Deal with missing text
    for (const requested of sentenceKeys) {
        const mapKey = requested.articleSentenceId + requested.lang;
        if (sentencesMap[mapKey] || droppedAsDuplicate.has(mapKey)) {
            continue;
        }
        if (unavailableFn) {
            sentencesMap[mapKey] = unavailableFn(requested.lang);
        }
        else {
            sentencesMap[mapKey] = isTitleLang(requested.lang) ? TITLE_TEXT_UNAVAILABLE : SENTENCE_TEXT_UNAVAILABLE;
        }
    }
    return sentencesMap;
}

// Group mentions (optionally only those whose date equals the given date) by article.
// Returns two lookups built in one pass:
//  - mentionsByGroup: '<publisher>_<article>' -> sentence key -> { quote, mentions }. Every article
//    also gets an ENGLISH_TITLE entry, so the title is available even when the metadata came from
//    body or grid. Quotes start out empty.
//  - uniqueMentionsAndTitles: the minimal set of '<publisher>_<article>/<sentence key>' strings
//    covering each mentioned sentence plus each article title, each mapped to itself.
export function groupMentionsByArticle(mentions: IMention[], date?: string) {
    const matchingMentions = date ? mentions.filter((m) => m.date === date) : mentions;
    const mentionsByGroup: IGroupedMentions = {};
    const uniqueMentionsAndTitles: { [ key: string ]: string } = {};

    matchingMentions.forEach((mention) => {
        const articleRef = `${mention.publisher}_${mention.article}`;
        const sentenceKey = `en_${mention.field}_${mention.paragraph}_${mention.sentence}`;

        // Lookup by combo of article and exact sentence, then by article and title
        for (const lookupKey of [`${articleRef}/${sentenceKey}`, `${articleRef}/${ENGLISH_TITLE}`]) {
            if (uniqueMentionsAndTitles[lookupKey] === undefined) {
                uniqueMentionsAndTitles[lookupKey] = lookupKey;
            }
        }

        // Separate lookup by article, with an entry for the sentence and one for the title
        if (mentionsByGroup[articleRef] === undefined) {
            mentionsByGroup[articleRef] = {};
        }
        const group = mentionsByGroup[articleRef];
        for (const entryKey of [sentenceKey, ENGLISH_TITLE]) {
            if (group[entryKey] === undefined) {
                group[entryKey] = {
                    quote: '',
                    mentions: [],
                };
            }
        }
        group[sentenceKey].mentions.push(mention);
    });

    return { mentionsByGroup, uniqueMentionsAndTitles };
}

// Merge near-duplicate sentences of one article IN PLACE. Each sentence is compared with every
// later one; when any of the three similarity measures (Levenshtein percentage, Jaro,
// Jaro-Winkler) exceeds 0.85, the later sentence's mentions are appended to the earlier one and
// the later entry is deleted from article.
export function measureAndCombineSimiliarMentions<T = IMention>(article: { [ sent: string ]: { quote: string, mentions: T[] } }) {
    const DUPLICATE_THRESHOLD = 0.85;
    const sentenceKeys = Object.keys(article);

    for (let keepAt = 0; keepAt < sentenceKeys.length; keepAt++) {
        const kept = article[sentenceKeys[keepAt]];
        let compareAt = keepAt + 1;
        while (compareAt < sentenceKeys.length) {
            const candidateKey = sentenceKeys[compareAt];
            const candidate = article[candidateKey];
            const scores = [
                levenshteinSimilarityPercentage(kept.quote, candidate.quote),
                jaroDistance(kept.quote, candidate.quote),
                jaroWinklerDistance(kept.quote, candidate.quote),
            ];
            if (!scores.some((score) => score > DUPLICATE_THRESHOLD)) {
                compareAt++;
                continue;
            }
            // Sentences are too similar and should be considered duplicates
            kept.mentions.push(...candidate.mentions);
            delete article[candidateKey];
            // The next candidate slides into compareAt, so the index is not advanced
            sentenceKeys.splice(compareAt, 1);
        }
    }
}

// Merge article groups whose titles are near-duplicates, IN PLACE. Titles are the quotes stored
// under ENGLISH_TITLE; assessThresholds supplies the (possibly normalized) titles and the
// threshold to use for each pair. When any of the three similarity measures exceeds that
// threshold, every sentence of the later group is folded into the earlier group and the later
// group is deleted from mentionsByGroup.
export function measureAndCombineSimiliarTitles(mentionsByGroup: IGroupedMentions) {
    const groupIds = Object.keys(mentionsByGroup);

    for (let keepAt = 0; keepAt < groupIds.length; keepAt++) {
        const keptGroup = mentionsByGroup[groupIds[keepAt]];
        let compareAt = keepAt + 1;
        while (compareAt < groupIds.length) {
            const candidateId = groupIds[compareAt];
            const candidateGroup = mentionsByGroup[candidateId];
            const { titleA, titleB, threshold } = assessThresholds(
                keptGroup[ENGLISH_TITLE].quote,
                candidateGroup[ENGLISH_TITLE].quote,
            );
            const scores = [
                levenshteinSimilarityPercentage(titleA, titleB),
                jaroDistance(titleA, titleB),
                jaroWinklerDistance(titleA, titleB),
            ];
            if (!scores.some((score) => score > threshold)) {
                compareAt++;
                continue;
            }
            // The candidate has the 'same' (duplicate) title as the kept group: fold it in
            for (const sentenceKey of Object.keys(candidateGroup)) {
                const incoming = candidateGroup[sentenceKey];
                const existing = keptGroup[sentenceKey];
                if (existing !== undefined) {
                    existing.mentions.push(...incoming.mentions);
                }
                else {
                    keptGroup[sentenceKey] = {
                        quote: incoming.quote,
                        mentions: incoming.mentions,
                    };
                }
            }
            delete mentionsByGroup[candidateId];
            // The next candidate slides into compareAt, so the index is not advanced
            groupIds.splice(compareAt, 1);
        }
    }
}

// Decide how strictly two titles must match to count as duplicates, and normalize them first.
// Ordinary titles are returned unchanged with a threshold of 0.75. A title containing
// 'Earnings Call Transcript' has that phrase (first occurrence) plus every four-digit number and
// every Q<digit> token removed, and the threshold is raised to 0.85 if either title is one.
// Returns the two (possibly normalized) titles together with the threshold.
function assessThresholds(titleA: string, titleB: string) {
    const TRANSCRIPT_MARKER = 'Earnings Call Transcript';
    const DEFAULT_THRESHOLD = 0.75;
    const TRANSCRIPT_THRESHOLD = 0.85;
    const stripTranscriptBoilerplate = (title: string): string =>
        title.replace(TRANSCRIPT_MARKER, '').replace(/\d\d\d\d|Q\d/g, '');

    let threshold = DEFAULT_THRESHOLD;
    let normalizedA = titleA;
    let normalizedB = titleB;
    // includes(), not indexOf(): indexOf returns -1 (truthy) when the phrase is absent and
    // 0 (falsy) when the title starts with it, which inverted this check for those cases
    if (titleA.includes(TRANSCRIPT_MARKER)) {
        normalizedA = stripTranscriptBoilerplate(titleA);
        threshold = TRANSCRIPT_THRESHOLD;
    }
    if (titleB.includes(TRANSCRIPT_MARKER)) {
        normalizedB = stripTranscriptBoilerplate(titleB);
        threshold = TRANSCRIPT_THRESHOLD;
    }
    return { titleA: normalizedA, titleB: normalizedB, threshold };
}
