#!/usr/bin/env python3
# Author :   Michael E. Rose <michael.ernst.rose@gmail.com>
"""Compare networks w.r.t. outliers."""

from glob import glob
from os.path import split, splitext

import networkx as nx
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from pybliometrics.scopus import AuthorRetrieval

# Folder that is searched for EconLit network files (*.gexf)
ECONLIT_FOLDER = './100_Econlit_networks/'
# Folder that is searched for Scopus network files (*.gexf)
SCOPUS_FOLDER = './200_Scopus_networks/'
# Root folder for generated output; the figure is written below "Figures/"
OUTPUT_FOLDER = './990_output/'


def analyze_network(fname, quantile=0.01):
    """Return the upper tail of publication counts of one network.

    Reads the GEXF network stored in `fname`, keeps the observations whose
    "Number of publications" node attribute is at or above the
    (1 - `quantile`) quantile, and prints the most prolific authors.

    Parameters
    ----------
    fname : str
        Path of a GEXF file.  The file name without extension is used as
        the year label, and the second "_"-separated token of the name of
        the containing folder is used as the dataset label.
    quantile : float, optional
        Share of the upper tail to keep (default 0.01, i.e. the top 1%).

    Returns
    -------
    pandas.DataFrame
        Columns "Publications", "Year" (file stem plus "s") and "Dataset",
        restricted to the upper tail and sorted ascending by "Publications".

    Notes
    -----
    Every node whose count reaches the third-largest retained value is
    printed, so ties can yield more than three names.  For the "Scopus"
    dataset, node identifiers are converted to names via `get_name`.
    """
    # Variables
    folder, basename = split(fname)
    year = splitext(basename)[0]
    # Only look at the containing folder's own name: underscores in parent
    # directories must not shift the position of the dataset token
    dataset = split(folder)[-1].split("_")[1]
    # Read network
    G = nx.read_gexf(fname)
    pubs = nx.get_node_attributes(G, "Number of publications")
    # Save values of the upper tail (top `quantile` share)
    new = pd.DataFrame({"Publications": list(pubs.values())})
    new["Year"] = year + "s"
    new["Dataset"] = dataset
    upper = new["Publications"].quantile(1-quantile)
    new = new[new["Publications"] >= upper]
    # Print individuals w/ most publications (ties included)
    new = new.sort_values("Publications")
    top_three_cutoff = new.tail(3)["Publications"].min()
    top_three = [n for n, v in pubs.items() if v >= top_three_cutoff]
    if dataset == "Scopus":
        top_three = [get_name(auth_id) for auth_id in top_three]
    print(f"... {dataset} {year}s:", "-".join(top_three))
    return new


def get_name(auth_id):
    """Look up an author via AuthorRetrieval and return "<surname> <given name>"."""
    author = AuthorRetrieval(auth_id)
    surname = author.surname
    given_name = author.given_name
    return "{} {}".format(surname, given_name)


def make_box_comparison(df, fname, x="Year", y="Publications", size=(8, 5)):
    """Draw grouped boxplots of `y` over `x`, one box per "Dataset" value,
    and save the figure to `fname`.
    """
    # One color per hue level
    colors = ["#a99c73", '#9c73a9']
    # Plot
    fig, ax = plt.subplots(figsize=size)
    sns.boxplot(data=df, x=x, y=y, hue="Dataset", palette=colors, ax=ax)
    # Aesthetics: drop the x-axis label
    ax.set_xlabel("")
    # Save and release the figure
    fig.savefig(fname, bbox_inches="tight")
    plt.close(fig)


def main():
    """Analyze every Scopus and EconLit network and plot the comparison.

    Each "*.gexf" file found in SCOPUS_FOLDER and ECONLIT_FOLDER is passed
    to `analyze_network` (in sorted path order); the resulting frames are
    stacked and handed to `make_box_comparison`, which writes the figure
    to "Figures/dataset_comparison.pdf" below OUTPUT_FOLDER.
    """
    # Read networks
    print(">>> Three most prolific authors by network:")
    files = sorted(glob(SCOPUS_FOLDER + "*.gexf") +
                   glob(ECONLIT_FOLDER + "*.gexf"))
    frames = [analyze_network(f) for f in files]
    # DataFrame.append() no longer exists in pandas >= 2.0; a single concat
    # also avoids copying the growing frame on every iteration.  concat()
    # rejects an empty list, hence the fallback to an empty frame.
    df = pd.concat(frames) if frames else pd.DataFrame()

    # Plot distribution
    fname = OUTPUT_FOLDER + "Figures/dataset_comparison.pdf"
    make_box_comparison(df, fname)


# Run the comparison only when executed as a script, not on import
if __name__ == '__main__':
    main()
