#!/usr/bin/env python3
# Authors:   Michael E. Rose <michael.ernst.rose@gmail.com>
"""Creates one co-author network per decade (see DECADES) using EconLit."""

from collections import defaultdict
from itertools import combinations
from glob import glob

import networkx as nx
import pandas as pd
from num2words import num2words
from RISparser import readris

# Input: folder from which the EconLit "*.ris" files are read
SOURCE_FOLDER = "./000_EconLit_data/"
# Input: CSV (first column is the index) whose "today" column renames journals
MAPPING_FILE = "./020_mapping/EconLit.csv"
# Output: folder receiving EconLit_actual.csv and EconLit_PNAS.csv
COVERAGE_FOLDER = "./050_coverage/"
# Output: folder receiving one .gexf network file per decade
TARGET_FOLDER = "./100_EconLit_networks/"
# Output: parent folder of the Statistics/*.txt files
OUTPUT_FOLDER = "./990_output/"

# Decade identifiers without the trailing "0" ("197" is reported as "1970s");
# a source file belongs to a decade when its path contains the identifier
DECADES = (
    "197",
    "198",
    "199",
    "200",
    "201",
)


def clean_name(name):
    """Reduce "Surname, Firstnames[, Suffix]" to lower-cased "surname initials".

    The initials are the upper-case characters of the first-name part.  If
    the first-name part contains a further comma, everything after it is
    replaced by the marker " Jr" (which is lower-cased with the rest).

    Returns None for names longer than 40 characters.  Names that do not
    contain ", " are returned unchanged, i.e. they are NOT lower-cased.
    """
    if len(name) > 40:
        return None
    surname, separator, remainder = name.partition(", ")
    if not separator:  # Name does not contain comma
        return name
    given, suffix_comma, _ = remainder.partition(",")
    initials = "".join(filter(str.isupper, given))
    if suffix_comma:
        initials += " Jr"
    return f"{surname} {initials}".lower()


def read_econlit_file(fname):
    """Read an EconLit RIS export into a DataFrame indexed by title.

    Each parsed entry contributes its "authors", "publication_year" and
    "journal_name" fields.  Entries lacking any of these fields or the
    "primary_title" field, and entries whose title ends with ": Correction",
    are skipped.  When a title occurs more than once, the last entry wins.

    Parameters
    ----------
    fname : str
        Path of the RIS file to read.

    Returns
    -------
    pandas.DataFrame
        Columns "authors", "year" and "journal"; the index is named "title".
        "year" holds the part of the publication year before the first "/".
        The frame is empty (but has these columns) if no entry qualifies.
    """
    columns = ["authors", "year", "journal"]
    data = {}
    with open(fname, "r") as inf:
        for entry in readris(inf):
            # The title is looked up inside the try as well, so that an entry
            # without title is skipped like any other incomplete entry
            try:
                title = entry["primary_title"]
                record = {"authors": entry["authors"],
                          "year": entry["publication_year"],
                          "journal": entry["journal_name"]}
            except KeyError:
                continue
            if title.endswith(": Correction"):
                continue
            data[title] = record
    if data:
        df = pd.DataFrame.from_dict(data).T
    else:
        # Without usable entries the transposed frame would have no columns
        # and the "year" lookup below would fail
        df = pd.DataFrame(columns=columns)
    df["year"] = df["year"].str.split("/").str[0]
    df.index.name = "title"
    return df


def write_stats(stat_dct):
    """Write every value of stat_dct to its own text file.

    The key becomes the file name (<OUTPUT_FOLDER>/Statistics/<key>.txt), the
    value is written with "," as thousands separator.
    """
    for name, value in stat_dct.items():
        target = f"{OUTPUT_FOLDER}/Statistics/{name}.txt"
        text = format(value, ",")
        with open(target, "w") as handle:
            handle.write(text)


def main():
    """Build one co-author network per decade plus coverage tables and statistics.

    For every entry of DECADES the function
    - reads all "*.ris" files in SOURCE_FOLDER whose path contains the decade,
    - records the titles of PNAS articles,
    - counts publications per (journal, year) after renaming journals via
      MAPPING_FILE; PNAS is left out of this table only,
    - cleans the author names and counts (co-authored) publications per author,
    - writes the co-author graph to <TARGET_FOLDER>/<decade>0.gexf.

    Afterwards it writes EconLit_actual.csv and EconLit_PNAS.csv to
    COVERAGE_FOLDER and the collected statistics via write_stats().
    """
    journal_map = pd.read_csv(MAPPING_FILE, index_col=0)["today"].to_dict()
    pnas = "Proceedings of the National Academy of Sciences of the United States of America"
    pnas_articles = {}

    stats = {}
    print(">>> Now working on:")
    # Per-decade volume tables are collected here and concatenated once after
    # the loop; DataFrame.append() no longer exists in pandas >= 2.0
    volume_frames = []
    for decade in DECADES:
        print(f"... {decade}0s")
        # Spelled-out label used in the statistics keys
        dec_name = num2words(decade[-1] + "0")
        # Read documents
        files = [f for f in glob(SOURCE_FOLDER + "*.ris") if decade in f]
        df = pd.concat([read_econlit_file(f) for f in files], axis=0)
        # Collect PNAS articles
        pnas_articles[decade + "0"] = df[df["journal"] == pnas].index.tolist()
        # Compute statistics (n_pubs still includes the PNAS articles)
        n_pubs = df.shape[0]
        new_volumes = df.groupby(["journal", "year"]).size().reset_index()
        new_volumes = new_volumes[new_volumes["journal"] != pnas]
        df = df.drop(["journal", "year"], axis=1)
        # Journals mapped to the same name are merged into one volume per year
        new_volumes["journal"] = new_volumes["journal"].replace(journal_map)
        new_volumes = new_volumes.groupby(["journal", "year"]).sum().reset_index()
        volume_frames.append(new_volumes)
        n_volumes = new_volumes.shape[0]
        print(f" -  {n_pubs:,} publications in {n_volumes:,} volumes")
        stats[f"Econ_N_of_pubs_{dec_name}s"] = n_pubs
        stats[f"Econ_N_of_volumes_{dec_name}s"] = n_volumes
        # Compute meta information & clean author names
        pub_count = defaultdict(int)
        pubco_count = defaultdict(int)
        auth_groups = []
        for auth_group in df["authors"]:
            authors = [clean_name(a) for a in auth_group]
            # Drop names that clean_name() rejected (None) or that are empty
            authors = list(filter(None, authors))
            auth_groups.append(authors)
            for auth in authors:
                pub_count[auth] += 1
                if len(authors) > 1:
                    pubco_count[auth] += 1
        # Generate author links
        print(f" -  {len(pub_count):,} distinct authors")
        stats[f"Econ_N_of_authors_{dec_name}s"] = len(pub_count)
        edges = [pair for group in auth_groups
                 for pair in combinations(group, 2)]
        # Populate network
        G = nx.Graph(name=decade)
        G.add_nodes_from(pub_count.keys())
        G.add_edges_from(edges)
        nx.set_node_attributes(G, pub_count, "Number of publications")
        nx.set_node_attributes(G, pubco_count, "Number of co-authored publications")
        # Write out
        fname = f"{TARGET_FOLDER}/{decade}0.gexf"
        nx.write_gexf(G, fname)

    # Save actual EconLit coverage
    volumes = pd.concat(volume_frames)
    volumes.columns = ["journal", "year", "pub_count"]
    volumes = volumes.sort_values(["journal", "year"])
    volumes.to_csv(COVERAGE_FOLDER + "EconLit_actual.csv", index=False)

    # Save PNAS articles (one column per decade; shorter columns are padded)
    pnas_df = pd.DataFrame.from_dict(pnas_articles, orient="index").T
    pnas_df.to_csv(COVERAGE_FOLDER + "EconLit_PNAS.csv", index=False)

    # Statistics
    write_stats(stats)


# Run the pipeline only when this file is executed as a script
if __name__ == '__main__':
    main()
