"""
Given a column of text in unicode, apply quantitative metrics over the top
and return in a column vector at per document frequency
NB: these metrics take whole words, not stemmed words
"""
import numpy as np
import pandas as pd
import re
from nltk import tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
import configparser
import src.txtutils as txtutil
import src.dictionaryclass as rfdictcls

# ---------------------------------------------------------------------------
# Settings
# ---------------------------------------------------------------------------
# Parse "config.ini"; the path is relative, so it is resolved against the
# current working directory (ConfigParser.read skips files it cannot open).
config = configparser.ConfigParser()
config.read("config.ini")
# Presumably the project stop-word collection (from txtutils.getStopwords);
# it is not referenced again in the visible part of this module.
my_stop_words = txtutil.getStopwords()
# Shared holder of the reference dictionaries read by the dictionary-based
# sentiment measures below (ny_*, op_*, harv_*, lm_*, afinn_*, stability_*,
# unique_dict_*).
refDictsInstance = rfdictcls.RefDictClass()
# Tokenizer that extracts runs of word characters.
tokenizer = RegexpTokenizer(r"\w+")
# Punctuation-based splitter used by punctuation_sentiment().
# It splits on whitespace that directly follows one of , . ! ? (optionally
# followed by a single or double quote), so commas also create a split.
# No split happens right after Mr. Mrs. Jr. Dr. Prof. Sr. or etc.; matching
# is case-insensitive.
# NOTE(review): the in-pattern comment on the "etc\." guard says "Mr." but
# the guard is for "etc."; it lives inside the regex literal, so it is left
# untouched here.
sentenceEnders = re.compile(
    r"""
                # Split sentences on whitespace between them.
                (?:               # Group for two positive lookbehinds.
                (?<=[,.!?])      # Either an end of sentence punct,
                | (?<=[,.!?]['"])  # or end of sentence punct and quote.
                )                 # End group of two positive lookbehinds.
                (?<!  Mr\.   )    # Don't end sentence on "Mr."
                (?<!  Mrs\.  )    # Don't end sentence on "Mrs."
                (?<!  Jr\.   )    # Don't end sentence on "Jr."
                (?<!  Dr\.   )    # Don't end sentence on "Dr."
                (?<!  Prof\. )    # Don't end sentence on "Prof."
                (?<!  Sr\.   )    # Don't end sentence on "Sr."
                (?<!  etc\.   )    # Don't end sentence on "Mr."
                \s+ """,  # Split on whitespace between sentences.
    re.IGNORECASE | re.VERBOSE,
)
# =============================================================================
# Functions to be run on text
# =============================================================================


def __sum_sent(sentDict, words):
    """
    Add up the scores of every word in `words` that has an entry in
    `sentDict`; words without an entry contribute nothing.
    in:
        sentDict: mapping from word to numeric score
        words: iterable of words (repeated words are counted each time)
    out:
        total score as a float (0.0 when nothing matches)
    """
    total = 0.0
    matched_scores = (sentDict[token] for token in words if token in sentDict)
    for score in matched_scores:
        total += score
    return total


def __sentiment_calculate(dfColIn, dict_pos, dict_neg):
    """
    Calculates a per-document dictionary score:
    (sum of dict_pos scores + sum of dict_neg scores) / number of words
    in:
        dfColIn: dataframe column (Series) of text, one document per row;
                 each entry is split on whitespace
        dict_pos: positive word dictionary (word -> score)
        dict_neg: negative word dictionary (word -> score); the two sums are
                  added, so this presumably holds negative scores - TODO
                  confirm against the dictionary class
    out:
        numpy array with one sentiment or uncertainty score per row;
        documents with no words score 0.0
    """
    words = dfColIn.apply(lambda x: x.split())
    pos = words.apply(lambda x: __sum_sent(dict_pos, x))
    neg = words.apply(lambda x: __sum_sent(dict_neg, x))
    # Built-in float: the np.float alias was removed in NumPy 1.24 and
    # raises AttributeError on current releases.
    length = words.apply(lambda x: float(len(x)))
    # An empty document gives 0/0 = NaN, which nan_to_num maps to 0.0
    return np.nan_to_num((pos + neg).divide(length))


def tf_idf(dfColIn, search_term):
    """
    Term frequency--inverse document frequency. This uses the
    ln(1+freq)/ln(1+N/n_t) form. N is number of articles per day.
    n_t is number of docs per day which have that term in.
    Only uses counts on each day

    in:
        dfColIn: column (Series) of article text with a datetime index
        search_term: pattern handed to str.contains / str.count (pandas
                     treats it as a regular expression by default)
    out:
        Series on the index of dfColIn with one score per article; rows
        where the ratio is undefined or infinite are set to 0.0

    This only plays well with a valid datetime index
    with no NaT values
    NOTE(review): the daily figures are expanded back to rows by position
    (np.repeat), which appears to assume the rows are already in
    chronological order - TODO confirm with callers.
    """
    # whether each article contains the term
    uIndex = dfColIn.str.contains(search_term)
    # for each, how many times it occurs
    countTerm = dfColIn.str.count(search_term)
    # All words per article
    countTot = dfColIn.apply(lambda x: len(x.split()))
    # Term frequency: matches per word; 0/0 (article with no words) -> 0.0
    rawFreq = countTerm / countTot
    rawFreq = rawFreq.fillna(0.0)
    # Series has one entry per article. Want to divide it by
    # a vector which is as long but only has one unique entry per day
    # So create a vector of per day word counts and then rep entries
    # according to how many articles there are per day.
    # No. articles per day:
    articles_per_day = uIndex.groupby(pd.Grouper(freq="D")).count()
    # Row for each article showing number of counts of articles with that
    # term in on each day
    counts_art_vec = np.repeat(
        uIndex.groupby(pd.Grouper(freq="D")).sum(), articles_per_day
    )
    # Return 1/n_t for row that is on day when there are n_t articles
    # with that word in
    one_over_n_t = np.ones(shape=len(counts_art_vec)) / counts_art_vec
    # Days on which no article contains the term give 1/0 = inf; zero it
    one_over_n_t[one_over_n_t == np.inf] = 0.0
    # Total articles per day, repeated to shape of input rows
    # (i.e. show total number of articles for the whole day
    #  on the rows for articles published on that day)
    N = np.repeat(articles_per_day, articles_per_day)
    # N / n_t per row (0 on days where no article contains the term)
    idf_vec = np.multiply(N, one_over_n_t)
    # ln(1 + tf) / ln(1 + N/n_t), computed on raw arrays so the article
    # index (rawFreq) and the repeated daily index (idf_vec) are not aligned
    series = (np.log(rawFreq.values + 1.0)) / (np.log(1.0 + idf_vec.values))
    # Put the result back on the article index
    series = pd.DataFrame(series, index=rawFreq.index)
    # A zero denominator yields inf or NaN; both end up as 0.0
    series[series == np.inf] = np.nan
    series = series.fillna(0.0)[0]
    return series


def tf_idf_econom(dfColIn):
    """
    tf-idf score for the term "econom", negated: the measure should rise
    when sentiment is positive, so the sign is flipped
    """
    econom_score = tf_idf(dfColIn, search_term="econom")
    return -econom_score


def tf_idf_uncertain(dfColIn):
    """
    tf-idf score for the term "uncertain" (sign left unchanged)
    """
    uncertain_score = tf_idf(dfColIn, search_term="uncertain")
    return uncertain_score


def word_count(dfColIn, word):
    """
    Number of matches of `word` in each entry of the text column
    (the term is passed straight to pandas str.count)
    """
    text_methods = dfColIn.str
    occurrences = text_methods.count(word)
    return occurrences


def word_count_econom(dfColIn):
    """
    Count of "econom" matches per document, negated: the measure should
    rise when sentiment is positive, so the sign is flipped
    """
    econom_hits = word_count(dfColIn, "econom")
    return -econom_hits


def word_count_uncertain(dfColIn):
    """
    Count of "uncertain" matches per document (sign left unchanged)
    """
    uncertain_hits = word_count(dfColIn, "uncertain")
    return uncertain_hits


def baker_bloom_davis(dfColIn):
    """
    Implements the UK version of the Baker, Bloom and Davis boolean search.

    An article is flagged when it matches at least one term from each of
    the three lists (economy, uncertainty, policy). A flagged article scores
    1 / (number of flagged articles published on the same calendar day);
    every other article scores 0.

    Global: dependent on total number of article
        - again could switch to division by cumulative count

    in:
        dfColIn: column (Series) of article text with a DatetimeIndex.
                 Matching is case-sensitive; missing text counts as no match.
    out:
        float Series named "BBD" on the index of dfColIn
    """
    EList = ["economic", "economy"]
    UList = ["uncertainty", "uncertain"]
    PList = [
        "spending",
        "policy",
        "deficit",
        "budget",
        "tax",
        "regulation",
        "bank of england",
    ]
    # One regex alternation per list is enough: the article must match
    # at least one term of every list.
    flagged = pd.Series(True, index=dfColIn.index)
    for terms in (EList, UList, PList):
        flagged = flagged & dfColIn.str.contains("|".join(terms), na=False)
    flagged = flagged.astype(int)
    # Number of flagged articles on each row's calendar day. transform keeps
    # the result aligned row by row, so the input need not be sorted by date.
    flagged_per_day = flagged.groupby(flagged.index.normalize()).transform("sum")
    # Days without any flagged article give 0/0 = NaN, reported as 0.0
    return (
        flagged.divide(flagged_per_day)
        .replace([-np.inf, np.inf], np.nan)
        .fillna(0.0)
        .rename("BBD")
    )


def husted(dfColIn):
    """
    Apply the UK version of the Husted et al.(2016)
    - Monetary Policy Uncertainty Index-.

    An article is flagged when it matches at least one term from each of
    the monetary, uncertainty and central-bank lists. A flagged article
    scores 1 / (number of articles published on the same calendar day that
    match the central-bank list); every other article scores 0.

    Global dependence on total number of articles containing specific string
     - could fix by dividing by cumulative no. articles containing specific
     string

    in:
        dfColIn: column (Series) of article text with a DatetimeIndex.
                 Matching is case-sensitive; missing text counts as no match.
    out:
        float Series named "Husted" on the index of dfColIn
    """
    MList = [
        "monetary policy",
        "monetary policies",
        "interest rate",
        "interest rates",
        "bank rate",
    ]
    UList = ["uncertainty", "uncertain"]
    # NOTE(review): "bank england" (no "of") presumably targets text whose
    # stop words were already removed - confirm against the caller.
    PList = ["bank england"]
    # One regex alternation per list: does the article match any term?
    has_monetary = dfColIn.str.contains("|".join(MList), na=False)
    has_uncertain = dfColIn.str.contains("|".join(UList), na=False)
    has_bank = dfColIn.str.contains("|".join(PList), na=False)
    # Number of PList articles on each row's calendar day. transform keeps
    # the result aligned row by row, so the input need not be sorted by date.
    bank_per_day = (
        has_bank.astype(int).groupby(dfColIn.index.normalize()).transform("sum")
    )
    flagged = (has_monetary & has_uncertain & has_bank).astype(int)
    # Days without any PList article give 0/0 = NaN, reported as 0.0
    return (
        flagged.divide(bank_per_day)
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0.0)
        .rename("Husted")
    )


def alexopoulos(dfColIn):
    """
    Boolean uncertainty indicator
    according to Alexopoulos, M., & Cohen, J. (2009).
    Uncertain times, uncertain measures.
    University of Toronto Department of Economics Working Paper, 352.
    in:
        dfColIn: dataframe column (Series) of text; matching is
                 case-sensitive and missing text counts as no match
    out:
        integer Series named "Alexopoulos_09" on the index of dfColIn:
        1 when the text matches at least one economy term and at least one
        uncertainty term, else 0
    """
    EList = ["economic", "economy"]
    UList = ["uncertainty", "uncertain"]
    # One regex alternation per list: does the article match any term?
    has_economy = dfColIn.str.contains("|".join(EList), na=False)
    has_uncertain = dfColIn.str.contains("|".join(UList), na=False)
    return (has_economy & has_uncertain).astype(int).rename("Alexopoulos_09")


def nyman(dfColIn):
    """
    Nyman dictionary based measure of sentiment: the excitement and anxiety
    dictionaries are scored per document and normalised by length
    """
    excitement_scores = refDictsInstance.ny_excite_dict
    anxiety_scores = refDictsInstance.ny_anxiety_dict
    return __sentiment_calculate(dfColIn, excitement_scores, anxiety_scores)


def opinion(dfColIn):
    """
    Sentiment from the Liu, Hu and Cheng dictionary ("Opinion Observer:
    Analyzing and Comparing Opinions on the Web."), normalised by length
    """
    positive_scores = refDictsInstance.op_positive_dict
    negative_scores = refDictsInstance.op_negative_dict
    return __sentiment_calculate(dfColIn, positive_scores, negative_scores)


def harvard(dfColIn):
    """
    Harvard inquirer dictionary based measure of sentiment: positive and
    negative dictionaries scored per document and normalised by length
    """
    positive_scores = refDictsInstance.harv_pos_dict
    negative_scores = refDictsInstance.harv_neg_dict
    return __sentiment_calculate(dfColIn, positive_scores, negative_scores)


def loughran(dfColIn):
    """
    Loughran and McDonald dictionary based measure of sentiment: positive
    and negative dictionaries scored per document and normalised by length
    """
    positive_scores = refDictsInstance.lm_pos_dict
    negative_scores = refDictsInstance.lm_neg_dict
    return __sentiment_calculate(dfColIn, positive_scores, negative_scores)


def afinn(dfColIn):
    """
    Afinn dictionary based measure of sentiment: positive and negative
    dictionaries scored per document and normalised by length
    """
    positive_scores = refDictsInstance.afinn_pos_dict
    negative_scores = refDictsInstance.afinn_neg_dict
    return __sentiment_calculate(dfColIn, positive_scores, negative_scores)


def stability(dfColIn):
    """
    Financial stability dictionary based measure of sentiment: positive and
    negative dictionaries scored per document and normalised by length
    """
    positive_scores = refDictsInstance.stability_pos_dict
    negative_scores = refDictsInstance.stability_neg_dict
    return __sentiment_calculate(dfColIn, positive_scores, negative_scores)


def vader(dfColIn):
    """
    Rule based VADER sentiment:
    A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text
        C.J. Hutto Eric Gilbert (2014)

    Every article is split into sentences, each sentence gets the VADER
    "compound" score, and the article score is the mean over its sentences.
    Articles that yield no sentences keep a score of 0.
    Returns a numpy array with one entry per article, in input order.
    """
    analyser = SentimentIntensityAnalyzer()
    article_scores = np.zeros(len(dfColIn))
    for article_pos, article in enumerate(dfColIn):
        sentences = tokenize.sent_tokenize(article)
        sentence_total = len(sentences)
        # per-sentence compound scores for this article
        compound_scores = np.zeros(sentence_total)
        if sentence_total <= 0:
            # nothing to average; the pre-filled zero stays in place
            continue
        for sentence_pos, sentence in enumerate(sentences):
            compound_scores[sentence_pos] = analyser.polarity_scores(sentence).get(
                "compound", ""
            )
        article_scores[article_pos] = sum(compound_scores) / sentence_total
    return article_scores


def punc_econom(dfColIn):
    """
    Punctuation-split sentiment around the term "econom"
    (delegates to punctuation_sentiment with a one-element noun list)
    """
    target_terms = ["econom"]
    return punctuation_sentiment(dfColIn, target_terms)


# =============================================================================
# sentiment from nouns
# =============================================================================
# What we want:
# 1. pass a noun or a list of nouns
# 2. search text for all matching nouns
# 3. surface all words which refer to the noun in question (could be adverbs,
# adjectives, anything)
#  e.g. [(x.text,x.pos_,x.dep_,[(x.text,x.dep_) for x in list(x.children)])
#  for x in text][-1]
#  ('brexit', 'NOUN', 'dobj', [('risky', 'amod')])
# For our example case, print out the related words
# Take the noun-related words and get their sentiment from the dictionaries
def __searchfornoun(dfColIn, nounstoUse, sentencesplit):
    """Private function that calculates noun sentiment
    by measuring sentiment on a sentence basis

    For every article that mentions one of the nouns, keep only the
    sentences containing a noun, delete the noun itself (and any "@") from
    them, clean the remaining text and score it with the combined
    positive/negative dictionaries. No coreference resolution is done here.

    - args:
    dfColIn: column (Series) of article text; lower-cased before matching
    nounstoUse: a noun or a list of nouns to search on each article; they
        are lower-cased and used as regular-expression fragments
    sentencesplit: callable taking the article string and returning its
        sentences
    - returns: per-article score array from __sentiment_calculate; articles
      that mention none of the nouns are scored on a blank string
    """
    if not isinstance(nounstoUse, list):
        nounstoUse = [nounstoUse]
    listofnouns = [x.lower() for x in nounstoUse]
    # Compile once instead of once per sentence and noun
    any_noun = re.compile("|".join(listofnouns))
    noun_strippers = [
        (noun, re.compile(r"(@|{})".format(noun))) for noun in listofnouns
    ]
    dfColIn = dfColIn.str.lower()

    extracted = []
    # Plain iteration: Series.iteritems() was removed in pandas 2.0
    for row in dfColIn:
        text = str(row)
        if not any_noun.search(text):
            extracted.append(" ")
            continue
        kept = []
        for sent in sentencesplit(text):
            # A sentence holding several nouns is kept once per noun,
            # each copy with that noun removed
            for noun, stripper in noun_strippers:
                if noun in sent:
                    kept.append(stripper.sub("", str(sent)))
        extracted.append(" ".join(kept))
    # Text cleaning goes here: remove punctuation
    cleaned = pd.Series(extracted, index=dfColIn.index, dtype=object).apply(
        lambda x: txtutil.clean_text(
            x, lemmatize=False, norm_case=False, stem=False, rm_stopwords=True
        )
    )
    # find  sentiment
    return __sentiment_calculate(
        cleaned,
        refDictsInstance.unique_dict_pos,
        refDictsInstance.unique_dict_neg,
    )


def word_count_list(dfColIn, words):
    """
    Counts occurrences of any of several terms
    in:
        dfColIn: dataframe column (Series) of text
        words: a single term or a list/tuple of terms; the terms are joined
               with "|" and matched as one regular expression
    out:
        Series with the number of matches per entry
    """
    # The original only bound the term list when `words` was NOT a list,
    # so passing a list raised UnboundLocalError.
    terms = [words] if isinstance(words, str) else list(words)
    pattern = "|".join(terms)
    return dfColIn.str.count(pattern)


def sentence_sentiment(dfColIn, listofnouns):
    """Noun sentiment using nltk sentence tokenisation:
    takes the articles where a noun appears and scores only the
    sentences that contain it
    """
    splitter = tokenize.sent_tokenize
    return __searchfornoun(dfColIn, listofnouns, sentencesplit=splitter)


def punctuation_sentiment(dfColIn, listofnouns):
    """Noun sentiment using the punctuation based splitter:
    takes the articles where a noun appears and scores only the
    text pieces that contain it, where pieces are delimited by the
    module-level sentenceEnders pattern
    """
    splitter = sentenceEnders.split
    return __searchfornoun(dfColIn, listofnouns, sentencesplit=splitter)
