#!/usr/bin/python
# -*- coding: utf-8 -*-

#~ trendtest
#~ Copyright (C) 2013 Interbull Centre
#~
#~ This program is free software: you can redistribute it and/or modify
#~ it under the terms of the GNU General Public License as published by
#~ the Free Software Foundation, either version 3 of the License, or
#~ (at your option) any later version.
#~
#~ This program is distributed in the hope that it will be useful,
#~ but WITHOUT ANY WARRANTY; without even the implied warranty of
#~ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#~ GNU General Public License for more details.
#~
#~  http://www.gnu.org/licenses/
#
# program trendtest1.py

'''
Perform trend validation by method 1 for one breed-population combination,
for all traits.
'''

# Revision history:
# 2013.10.21 GJansen - original version

import os
import sys
import argparse
from datetime import date
import numpy as np
import ibutils

# Run date as YYYYMMDD; it is written into every record of the output file.
testdate = date.today().strftime('%Y%m%d')

# Command-line interface.  For a usage summary run:
#     python trendtest1.py --help
# (argparse tutorial: http://docs.python.org/2.7/howto/argparse.html)
epilog = ('See detailed instructions at: '
          'https://wiki.interbull.org/public/TrendTest_Software?action=print')

parser = argparse.ArgumentParser(epilog=epilog)

# Required positional arguments, registered in command-line order.
for _name, _help in (
        ('brd', 'evaluation breed code (BSW/GUE/JER/HOL/RDC/SIM)'),
        ('pop', 'population code (same as country code except for '
                'CHR/DEA/DFS/FRR/FRM)'),
        ('datadir', 'absolute or relative path to data files'),
):
    parser.add_argument(_name, help=_help)

# Optional switches.
parser.add_argument(
    '-v', '--verbose',
    action='store_true',
    help='increase output verbosity')
parser.add_argument(
    '-c', '--controlfile',
    default=None,
    help='path/name of the control file '
         '(default=DATADIR/file305_POPBRD)')
parser.add_argument(
    '-m', '--mergefiles',
    action='store_true',
    help='write merged data files '
         '(for independent data checks)')
parser.add_argument(
    '-M', '--mergedir',
    help='absolute or relative path for merged data files '
         '(default=DATADIR/merged1)')

args = parser.parse_args()

# Normalise the breed/population codes; together they form the suffix that is
# appended to every data file name (e.g. file305_POPBRD).
brd = ibutils.check_breed(args.brd)
pop = args.pop.upper()
_POPBRD = '_' + pop + brd

# Resolve DATADIR to an absolute path *before* changing directory: once the
# working directory has moved into DATADIR, a relative DATADIR would resolve
# to DATADIR/DATADIR.
datadir_abs = os.path.abspath(args.datadir)
if not os.path.isdir(datadir_abs):
    print('absolute DATADIR: ' + datadir_abs)
    print('%s: error: DATADIR does not exist or has incorrect permissions'
          % sys.argv[0])
    sys.exit(1)
if args.mergefiles:
    # The merge directory is also made absolute before the chdir below, so a
    # relative --mergedir is interpreted relative to the invocation directory.
    mergedir = os.path.join(args.datadir, 'merged1')
    mergedir = os.path.abspath(args.mergedir if args.mergedir else mergedir)
    if not os.path.exists(mergedir):
        os.makedirs(mergedir)

#============================================================
os.chdir(datadir_abs)                   # NB! move to DATADIR
#============================================================

# All file names below are relative to DATADIR (the current directory).
# NOTE: a relative --controlfile is therefore resolved against DATADIR too.
filebd = 'bdate' + _POPBRD
file305 = args.controlfile if args.controlfile else ('file305' + _POPBRD)
fileout = 'file311' + _POPBRD
filelog = 'tt1' + _POPBRD + '.log'
if args.verbose:
    print('%s: writing log to %s/%s' % (sys.argv[0], args.datadir, filelog))
log = open(filelog, 'w')

ibutils.dated_msg(sys.argv[0]+': start', log=log)
log.write(sys.argv[0] + ' version=' + ibutils.version + '\n')

if args.verbose:
    log.write('absolute DATADIR: %s\n' % datadir_abs)


log.write('Processing brd=%s pop=%s datadir=%s\n'
      % (brd, pop, args.datadir))

#------------------------------------------------------------------------------
# read and store birth years
# byear maps the animal id (first 19 characters of each record) to the birth
# year taken from columns 21-24, with blanks replaced by zeros so that the
# value is always a 4-character string that can be compared as text.
byear = {}
with open(filebd) as fbd:   # close the file as soon as it has been read
    for rec in fbd:
        byear[rec[:19]] = rec[20:24].replace(' ', '0')
if args.verbose:
    log.write('stored %6d records from file %s\n' % (len(byear), filebd))


#------------------------------------------------------------------------------
# process files trait by trait, in the order listed in the trait info file
#
# For every non-comment record of the control file this loop
#   1. parses the trait parameters,
#   2. reads the all-lactation and first-lactation proofs for the trait,
#   3. applies the data edits and collects birth year / proofs of kept bulls,
#   4. regresses both sets of proofs on birth year and logs the results,
#   5. writes one "311" record with the method 1 test outcome to the output
#      file.
if args.verbose:
    log.write('opening output file %s ...\n' % fileout)
fout = open(fileout, 'w')
first_rec = True

# Placeholder used when a bull has no first-lactation record; field order is
# (top, off, sta, nd, nh, edc, rel, ebv), i.e. edc/rel/"ebv" are zero.
empty = ('??', '?', '??', 0, 0, 0., 0., 0.)

# Read the control file up front so its handle is closed promptly.
with open(file305) as fctl:
    control_recs = fctl.readlines()

for rec in control_recs:
    if rec[0] == '#': # skip header line
        continue
    # A record must hold exactly 15 whitespace-separated fields, with numeric
    # sire SD and minimum herds/daughters; anything else is reported as a
    # parse error instead of a traceback.
    try:
        (trtg, trt, evdate, herit, siresd, merit, type2x, min_hrd, min_dgh,
         byr1, miny, maxy, corr, yy_mon, chg) = rec.strip().split()
        siresd = float(siresd)
        min_hrd = int(min_hrd)
        min_dgh = int(min_dgh)
    except ValueError:
        print('error: could not parse this line from file ' + file305)
        print('line ::', rec)
        print('please review the documentation')
        sys.exit(9)
    merit = merit.upper()
    if merit not in ('B+', 'B-', 'T+', 'T-'):
        print('error: genetic merit must be B+/B-/T+/T- in file305')
        print('line ::', rec)
        sys.exit(9)
    # proofs are breeding values (BV) or transmitting abilities (TA); the
    # supplied sire SD is doubled for TA
    bvta = 'BV' if merit[0] == 'B' else 'TA'
    sdg = siresd * (2. if bvta == 'TA' else 1.)

    # birth years are compared as strings (see the first edit below)
    min_byear = byr1

    if args.verbose:
        log.write('\n\n' + '-'*85 + '\n')
        log.write('processing trt byear herds daus type2x bvta     sdg\n')
        log.write('inputs     %s %s  %5d%5d    %s    %s  %s\n' %
                  (trt,min_byear,min_hrd,min_dgh,type2x,merit,sdg))
        log.write('-'*85 + '\n')
    #--------------------------------------------------------------------------
    # read and store all lactation and first lactation data for this trait
    # (each result is indexed by animal id and yields an 8-field record)
    dataAL, log = ibutils.read_file300(brd, pop, trt, 'file300' + _POPBRD,
                                       args, log)
    dataFL, log = ibutils.read_file300(brd, pop, trt, 'file300FL' + _POPBRD,
                                       args, log)
    #==========================================================================
    # CREATE EDITED DATA FOR FIRST/ALL LACTATION PROOFS
    x = []      # birth years of kept bulls
    y1 = []     # first lactation proofs
    y2 = []     # all lactation proofs
    w1 = []     # regression weights, first lactation
    w2 = []     # regression weights, all lactations
    merged = [] # lines of the optional merged data file for this trait
    # process bulls on Cf file and look for stored records from other datasets
    # - edc/rel/"ebv" are zero if no record is found for other datasets
    for aid in dataAL:
        byr = byear.get(aid, '0000')
        # first edit: skip any bulls born before min_byear
        if byr < min_byear:
            continue
        # top = type of proof (11, 12, 21 etc)
        # off = officially publishable proof (Y/N)
        # sta = bull status
        # nd = n. daughters
        # nh = n. herds
        top, off, sta, nd, nh, edc, rel, ebv = dataAL[aid]
        top1, off1, sta1, nd1, nh1, edc1, rel1, ebv1 = dataFL.get(aid, empty)

        # additional data edits
        keep = 'N'
        # -- is bull a domestic AI bull (or foreign bull for small populations)
        if (top in ['11', '12'] and sta != '20') or \
            (type2x == 'Y' and top in ['21', '22']):
            keep = 'Y'
        # -- does bull meet minimums for herds/daus/edc for first/all lactation
        if nh < min_hrd or nh1 < min_hrd:
            keep = 'N'
        if (nd<min_dgh and edc<min_dgh) or (nd1<min_dgh and edc1<min_dgh):
            keep = 'N'
        if keep == 'Y':
            x.append(int(byr))
            y1.append(ebv1)
            y2.append(ebv)
            w1.append(1.) # unweighted analysis
            w2.append(1.) # unweighted analysis
            # w1.append(edc1) # for weighted analysis
            # w2.append(edc) # for weighted analysis

        if args.mergefiles:
            # create a merged dataset for additional checks with SAS, R, etc.
            # (every bull passing the birth-year edit is listed, kept or not)
            merged.append(','.join((aid, byr, keep, top, off, sta,
                                    'AL', str(nd), str(nh), str(edc),
                                    str(rel), str(ebv),
                                    'FL', str(nd1), str(nh1), str(edc1),
                                    str(rel1), str(ebv1))) + '\n')
    if args.mergefiles:
        # The file is opened exactly once per trait, after all rows have been
        # collected, so no row is lost to a re-open and the handle is always
        # closed (an empty file is written when no bull qualifies).
        with open(os.path.join(mergedir, trt + '.csv'), 'w') as fmerge:
            fmerge.writelines(merged)

    n = len(x)
    if n == 0:
        if args.verbose:
            log.write('warning: no merged records after edits for trait %s\n\n'\
                      % (trt))
        continue
    if not args.verbose: log.write('\n')
    log.write('\nSummary statistics for trait "%s" (N=%d bulls)\n' % (trt, n))
    log.write('-'*62 + '\n')
    log.write('Trait Variable         Mean       Std      Min      Max\n')
    log.write('-'*62 + '\n')
    fmt = '%s   %s %9.3f %9.3f %8.2f %8.2f\n'
    y2 = np.array(y2)
    log.write(fmt % (trt, 'ALL LACT EBV', y2.mean(), y2.std(ddof=1),
                     y2.min(), y2.max()))
    y1 = np.array(y1)
    log.write(fmt % (trt, '1ST LACT EBV', y1.mean(), y1.std(ddof=1),
                     y1.min(), y1.max()))
    x = np.array(x)
    log.write(fmt % (trt, 'BIRTHYEAR   ', x.mean(), x.std(ddof=1),
                     x.min(), x.max()))
    log.write('-'*62 + '\n')

    #---------------------------------------------------------------------------
    # calculate test stats
    # (element [1] of the coefficient / standard error results is used as the
    #  slope on birth year)
    # model T: EBVT = b0 + b2*BYEAR + e  (all lactation)
    bT, bseT, R2_T, rmseT = ibutils.simple_wls(x, y2, w2)
    # model 1: EBV1 = b0 + b1*BYEAR + e  (first lactation)
    b1, bse1, R2_1, rmse1 = ibutils.simple_wls(x, y1, w1)

    log.write('Regressions on year of birth\n')
    log.write('-'*62 + '\n')
    log.write('Trait Variable        Slope   se(b)  R^2(%)    RMSE\n')
    log.write('-'*62 + '\n')
    fmt = '%s   %s %8.3f %7.3f %6.1f %8.3f\n'
    log.write(fmt % (trt, 'ALL LACT EBV',  bT[1], bseT[1], 100.*R2_T, rmseT))
    log.write(fmt % (trt, '1ST LACT EBV',  b1[1], bse1[1], 100.*R2_1, rmse1))
    log.write('-'*62 + '\n')

    # check ranges of std's for first/all lactation proofs and supplied SD
    warnings = []
    if rmse1/rmseT < 0.85 or rmse1/rmseT > 1.05:
        log.write('WARNING for trait %s:\n => Ratio RMSE_1/RMSE_T outside'
              ' expected range (0.85 to 1.05).\n => Proofs for first and all'
              ' lactations need to be on the same scale.\n' % trt)
        warnings.append('LACT1_SCALE_WARNING')
    if merit[0] == 'B' and (rmseT/sdg < 0.7 or rmseT/sdg > 1.4):
        log.write('WARNING for trait %s:\n => Ratio RMSE_T/SDg outside'
              ' expected range (0.7 to 1.4).\n => SD of proofs on BV scale'
              ' should be roughly the same as SDg.\n' % trt)
        warnings.append('SDG_BV_WARNING')
    elif merit[0] == 'T' and (rmseT/sdg < 0.35 or rmseT/sdg > 0.7):
        log.write('WARNING for trait %s:\n => Ratio RMSE_T/SDg outside'
              ' expected range (0.35 to 0.7).\n => SD of proofs on TA scale'
              ' should be roughly half of SDg.\n' % trt)
        warnings.append('SDG_TA_WARNING')

    # method 1 pass or fail: the absolute difference between the two slopes,
    # in units of SDg, must stay below 0.02 (BV scale) or 0.01 (TA scale)
    testval = abs(bT[1] - b1[1]) / sdg
    if (bvta == 'BV' and testval < 0.02) or (bvta == 'TA' and testval < 0.01):
        pass1 = 'PASS'
    else:
        pass1 = 'FAIL'
    log.write('\nMethod 1 test: abs(%0.3f-%0.3f)/%0.3f = %0.4f (%s) ==> %s for'
              ' trait %s\n' % (bT[1], b1[1], sdg, testval, bvta, pass1, trt))

    warnings = ','.join(warnings) if warnings else 'none'

    # the column header is written once, just before the first data record
    if first_rec:
        fout.write('rec brd pop tgrp trt testdate pass testval      SDg bv    '
                   'b_ALL    b_1ST bulls   stdALL   std1ST '
                   'x byr1 mh md warnings\n')
        first_rec = False
    fmt = '311 %s %s %s %s %s %s %7.3f %8.3f %s %8.3f %8.3f%6d %8.3f %8.3f'\
        ' %s %s %2d %2d %s\n'
    fout.write(fmt % (brd, pop, trtg, trt, testdate, pass1, testval, sdg, bvta,
                      bT[1], b1[1], n, y2.std(ddof=1), y1.std(ddof=1),
                      type2x, min_byear, min_hrd, min_dgh, warnings))
fout.close()

ibutils.dated_msg(sys.argv[0]+': end', log=log)
log.close()
