"""
This takes all Newspaper-*.csv files and merges them into a single
Newspaper.csv file, where "Newspaper" is one of the names of a newspaper.

It also de-dupes newspaper files.
"""
import os
import pandas as pd
import glob

path = "DataSources/DowJones"  # input/output path

# Only the partial "Newspaper-*.csv" downloads are inputs. Matching on the
# hyphen keeps the merged "<Newspaper>.csv" files written below (same
# directory) from being read back in when the script is run again.
# sorted() makes the row order, and therefore which duplicate is kept
# later on, reproducible: glob returns files in arbitrary order.
all_rec = sorted(glob.glob(os.path.join(path, "*-*.csv")))
if not all_rec:
    # pd.concat would otherwise fail with an opaque "No objects to
    # concatenate" error.
    raise SystemExit("No Newspaper-*.csv files found in {}".format(path))
dataframes = (pd.read_csv(f) for f in all_rec)
big_dataframe = pd.concat(dataframes, ignore_index=True)


# One merged file per newspaper, written next to the inputs because the
# de-duplication step reads "<path>/<name>.csv".
# Rows with a missing "Newspaper" value are left out by groupby().
for name, group in big_dataframe.groupby("Newspaper"):
    print(name)
    try:
        group.to_csv(os.path.join(path, "{}.csv".format(name)), index=False)
    except UnicodeEncodeError as err:
        # Best effort: report the newspaper that could not be written and
        # carry on with the remaining ones.
        print("FAILED ({}): {}".format(name, err))

# Dropping duplicates from the created newspapers.
# For each listed newspaper this reads "<path>/<name>.csv", removes rows
# whose "action" contains "rep", drops rows whose title + snippet + text are
# identical, writes "<name>_non_dup.csv" and appends a short report to
# "duplicated_<name>.txt" (both in the current working directory).
NewspapersList = ["GRDN", "T000", "DMIR", "THES", "DAIM"]
for name in NewspapersList:
    newdf = pd.read_csv(os.path.join(path, name) + ".csv")
    original_num = len(newdf)

    # na=False: a row with a missing "action" is not a repetition and is
    # kept. Without it, str.contains yields NaN for those rows and the
    # boolean inversion below breaks.
    is_repetition = newdf["action"].str.contains("rep", na=False)
    # .copy() so the column/index assignments below act on an independent
    # frame rather than on a slice of the one just read.
    newdf = newdf[~is_repetition].copy()  # many articles are dropped
    lenghtwith_dup = len(newdf)
    print("Length with the duplicates: " + str(lenghtwith_dup))

    # Key used to detect identical articles; missing values become "nan".
    # The column is kept in the output file, as before.
    newdf[name + "_full"] = (
        newdf[["title", "snippet", "text"]].astype(str).apply(" ".join, axis=1)
    )
    # NOTE(review): this index is discarded by to_csv(index=False) below;
    # its only visible effect is that unparsable dates raise here.
    newdf.index = pd.to_datetime(newdf["date"])
    newdf = newdf.drop_duplicates(subset=[name + "_full"], keep="first")
    nondup = len(newdf)
    duplicates = lenghtwith_dup - nondup
    print("Length without the duplicates:" + str(nondup))

    mes = (
        "Total number of articles for the {} are: {}\n"
        "Number without repetition:{}\n"
        "Total number of identical duplicates: {}"
    ).format(name, original_num, lenghtwith_dup, duplicates)
    newdf.to_csv("{}_non_dup.csv".format(name), index=False)
    print(mes)
    # Opened only once there is something to write, so a failed read does
    # not leave an empty report behind. The trailing newline keeps reports
    # from successive runs (append mode) on separate lines.
    with open("duplicated_" + name + ".txt", "a") as inputfile:
        inputfile.write(mes + "\n")
