#!/usr/bin/env python3
# Author(s):  Nurzhan Sapargali <sapargalin95@gmail.com>
#             Michael E. Rose <Michael.Ernst.Rose@gmail.com>
"""Computes network statistics for coauthor networks akin to Table 1."""

from glob import glob
from collections import Counter, OrderedDict
from math import sqrt
from os.path import basename, splitext

import networkx as nx
import pandas as pd
from numpy import average, std

# Input folders holding one network file per period; main() globs
# `folder + "*"`, so the trailing slash is required.
ORIGINALECONLIT_FOLDER = './005_original_pajek/'  # reported as "EconO"
MYECONLIT_FOLDER = './100_EconLit_networks/'  # reported as "Econ"
SCOPUS_FOLDER = './200_Scopus_networks/'  # reported as "Scopus"
# Root folder for results; main() writes the tables into its `Tables/` subfolder.
OUTPUT_FOLDER = './990_output/'


def analyze_network(fname):
    """Analyze network w.r.t. nodes, edges, clustering and path length.

    Parameters
    ----------
    fname : str
        Path of the network file.  Names ending in "gexf" are read with
        `nx.read_gexf`; everything else is read with `nx.read_pajek` and
        converted to a plain `nx.Graph`.

    Returns
    -------
    pandas.Series
        One entry per statistic, in a fixed order: number of authors,
        mean and standard deviation of the degree, size and share of the
        giant component, size of the second largest component (0 if the
        network is connected), number and share of isolated authors, the
        average clustering coefficient, and mean and standard deviation
        of the shortest-path distance within the giant component (NaN if
        the giant component consists of a single node).  The entries
        labelled "Percentage" are shares in [0, 1].

    Raises
    ------
    ValueError
        If the network contains no nodes.
    """
    if fname.endswith("gexf"):
        G = nx.read_gexf(fname)
    else:
        # read_pajek returns a multigraph; nx.Graph() merges parallel edges
        G = nx.Graph(nx.read_pajek(fname))
    total = len(G)
    if not total:
        raise ValueError(f"Network in {fname!r} contains no nodes")
    row = OrderedDict()
    # Degree-based
    row['Total authors'] = total
    degrees = [degree for _, degree in G.degree()]
    row['Degree: Average'] = average(degrees)
    row['Degree: Standard deviation'] = std(degrees)
    # Component-based
    comps = sorted(nx.connected_components(G), key=len, reverse=True)
    giant = comps[0]
    row['Giant Component: Size'] = len(giant)
    row['Giant Component: Percentage'] = len(giant)/total
    # A connected network has no second component
    row['Second largest component'] = len(comps[1]) if len(comps) > 1 else 0
    del comps  # may be large; free it before the all-pairs computation
    isolated = degrees.count(0)
    row['Isolated authors: Number'] = isolated
    row['Isolated authors: Percentage'] = isolated/total
    # Clustering
    row['Clustering coefficient'] = nx.average_clustering(G)
    # Shortest paths: histogram of distances between pairs in the giant
    # component.  Every unordered pair is visited from both endpoints,
    # hence the weight of 0.5 per visit.
    dist_hist = Counter()
    for _, lengths in nx.all_pairs_shortest_path_length(G.subgraph(giant)):
        for length in lengths.values():
            dist_hist[length] += 0.5
    dist_hist.pop(0, None)  # drop the distance of each node to itself
    if dist_hist:
        distances = list(dist_hist)
        weights = list(dist_hist.values())
        avg_sh_dist = average(distances, weights=weights)
        var_sh_dist = average([(d - avg_sh_dist)**2 for d in distances],
                              weights=weights)
        std_sh_dist = sqrt(var_sh_dist)
    else:
        # Single-node giant component: there are no pairs to measure, and
        # numpy.average would fail on weights summing to zero
        avg_sh_dist = std_sh_dist = float('nan')
    row['Distance in giant component: Average'] = avg_sh_dist
    row['Distance in giant component: Standard deviation'] = std_sh_dist
    # Finalize
    return pd.Series(row)


def main():
    """Compute statistics for all networks and write them as LaTeX tables.

    For each of the three input folders, every file not ending in ".md"
    is analyzed and becomes one column named after the file (without
    extension) plus "s".  Files are processed in sorted order so that the
    column order is reproducible: the first three columns are written to
    `<label>_1.tex`, all remaining columns to `<label>_1_long.tex`.
    """
    # Local import keeps the module-level imports untouched
    from os import makedirs

    # Create the target folder up front so that a missing folder does not
    # surface only after the long-running network analysis
    makedirs(f"{OUTPUT_FOLDER}/Tables", exist_ok=True)
    float_format = '{:,.3f}'.format

    print(">>> Now working on:")
    paths = (MYECONLIT_FOLDER, ORIGINALECONLIT_FOLDER, SCOPUS_FOLDER)
    labels = ("Econ", "EconO", "Scopus")
    for label, folder in zip(labels, paths):
        print(f"... {label}")
        # Analyze networks; glob() returns files in arbitrary order, but
        # the split into two tables below relies on column position
        table1 = pd.DataFrame()
        files = sorted(f for f in glob(folder + "*") if not f.endswith(".md"))
        for f in files:
            year = splitext(basename(f))[0]
            print("  -", year)
            table1[year + "s"] = analyze_network(f)
        # Write out
        fname = f"{OUTPUT_FOLDER}/Tables/{label}_1.tex"
        table1.iloc[:, :3].to_latex(fname, float_format=float_format)
        fname = f"{OUTPUT_FOLDER}/Tables/{label}_1_long.tex"
        table1.iloc[:, 3:].to_latex(fname, float_format=float_format)


# Run the analysis only when executed as a script, not when imported
if __name__ == '__main__':
    main()
