#!/usr/bin/python3

# name:    scaffold_seeker.py
# author:  nbehrnd@yahoo.com
# license: GPLv3, 2021
# date:    2021-05-06 (YYYY-MM-DD)
# edit:
#
"""
Python script scaffold_seeker.py reports the number of Murcko scaffolds
and the SMILES strings of them per 'plate ID'.


The script is written for the CLI of Python 3.9.2 and is known to work
with the third-party library RDKit (version 2020.09.4).  It is called by

python3 scaffold_seeker.py [input_file.smi]

to yield [input_file_analysis.txt] as a permanent record.  In Linux,
setting the executable bit (`chmod +x scaffold_seeker.py`) allows one
to shorten the required input to

./scaffold_seeker.py [input_file.smi]

to generate the same output.  In other OSes (not tested), it might be
useful to adjust the shebang.


As its mandatory parameter, the script expects a comma separated file
whose lines follow the general pattern

plate_abc01,spot_1,c1ccccc1,benzene_1

about the plate ID, the sample's location on this plate, the compound's
complete structure expressed as a SMILES string, and an individual
compound / library ID.  Script scaffold_seeker.py writes a permanent
record, each line in the pattern of

plate_abc03; 2; c1ccc2[nH]ccc2c1, c1cocn1

to state the plate ID, the number of Murcko scaffolds, and the SMILES
of these Murcko scaffolds.  In the output, the categories are separated
by a semicolon.  In case of multiple Murcko scaffolds identified, these
will be comma separated.
"""

import argparse
import os
import sys

import rdkit
from rdkit.Chem.Scaffolds import MurckoScaffold


def get_args():
    """Parse the command line and return the resulting namespace.

    The only argument is the mandatory path of the comma separated input
    file; it is available as attribute ``positional`` of the returned
    namespace.
    """
    summary = ('Report number and SMILES of Murcko scaffolds '
               'identified per plate.')

    cli = argparse.ArgumentParser(
        description=summary,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    cli.add_argument(
        'positional',
        metavar='FILE',
        help='The comma separated input file to process.',
    )

    namespace = cli.parse_args()
    return namespace


def clearance_report():
    """Determine the report file's name and remove a stale copy of it.

    The name of the input file is read from the command line by
    get_args().  The report name is the input name with its (last)
    extension replaced by ``_analysis.txt``; e.g., ``plates.smi`` yields
    ``plates_analysis.txt``.  A report file which already exists is
    removed, because the results are appended to this file later on.

    Returns:
        The path of the report file as a string.

    Raises:
        SystemExit: if an already existing report file cannot be removed.
    """
    input_file = str(get_args().positional)

    # Only strip the extension.  Splitting at the first "." truncates
    # paths like "./plates.smi" (to "") or "run.1/plates.smi" (to "run").
    report_file = os.path.splitext(input_file)[0] + "_analysis.txt"

    # Ensure there is no file this analysis' results would be appended to:
    if os.path.isfile(report_file):
        try:
            os.remove(report_file)
        except OSError:
            print("Remove file '{}' manually and start again.  Exit.".format(
                report_file))
            sys.exit(1)  # non-zero status: the analysis did not run

    return report_file


def identify_plates(input_file=""):
    """Recognize the ID of the plates used.

    The plate ID is the first comma separated field of a line.  Blank
    lines (e.g., a trailing empty line) carry no plate ID and are skipped.

    Args:
        input_file: path of the comma separated file to read.

    Returns:
        A list with each plate ID once, sorted case-insensitively.  IDs
        which differ only by case keep the order of their first
        appearance in the file.
    """
    # A dict serves as an ordered set: constant-time membership test
    # while the order of first appearance is retained.
    plate_ids = {}

    with open(input_file, mode="r") as source:
        for line in source:
            record = line.strip()
            if not record:
                continue
            plate_ids.setdefault(record.split(",")[0], None)

    return sorted(plate_ids, key=str.lower)


def recognize_murcko_scaffolds(input_file=""):
    """Identify and report the Murcko scaffolds per plate ID.

    For every plate ID of the input file, the Murcko scaffold of each
    compound (third comma separated field, a SMILES string) is determined
    with RDKit.  Per plate, one line in the pattern of

    plate_abc03; 2; c1ccc2[nH]ccc2c1, c1cocn1

    (plate ID; number of different scaffolds; their SMILES) is appended
    to the report file named by clearance_report().

    Args:
        input_file: retained for backward compatibility.  Its value is
            overridden by the file named on the command line, because
            the name of the report file is derived from the command
            line, too.

    Raises:
        SystemExit: if the report file cannot be written.
    """
    args = get_args()
    input_file = args.positional
    plates = identify_plates(input_file=input_file)
    reporter = clearance_report()

    # Collect the SMILES strings per plate in a single pass, instead of
    # reading the whole input file again for each plate.
    smiles_by_plate = {}
    with open(input_file, mode="r") as source:
        for line_number, line in enumerate(source, start=1):
            record = line.strip()
            if not record:
                continue  # e.g., a trailing blank line

            data = record.split(",")
            if len(data) < 3:
                # Without a third field there is no SMILES string to use.
                print("Skip line {} (less than three fields): '{}'".format(
                    line_number, record), file=sys.stderr)
                continue

            smiles_by_plate.setdefault(data[0], []).append(data[2])

    for plate in plates:
        smiles_of_plate = smiles_by_plate.get(plate)
        if not smiles_of_plate:
            continue  # plate ID without any usable record

        print("Processing plate {}".format(plate))  # a CLI progress report

        # dict.fromkeys() drops duplicates and keeps the scaffolds in
        # the order of their first appearance.
        murcko_scaffolds = list(
            dict.fromkeys(
                str(MurckoScaffold.MurckoScaffoldSmiles(smiles_structure))
                for smiles_structure in smiles_of_plate))

        # store the results in a permanent record:
        report_per_plate = "{}; {}; {}".format(plate,
                                               len(murcko_scaffolds),
                                               ", ".join(murcko_scaffolds))

        # Append plate by plate, so the results of the plates already
        # processed are kept if a later plate fails.
        try:
            with open(reporter, mode="a") as newfile:
                newfile.write("{}\n".format(report_per_plate))
        except OSError:
            print("Report generation about file '{}' failed.  Exit.".format(
                reporter))
            sys.exit(1)  # non-zero status: the report is incomplete

    print("\nResults of processing are stored in file '{}'.".format(reporter))

def main():
    """Run the scaffold analysis for the file named on the command line."""
    recognize_murcko_scaffolds()


# Start the analysis only if this file is run as a script, not on import.
if __name__ == "__main__":
    main()
