#!/usr/bin/python
# -*- coding: utf-8 -*-

#~ trendtest
#~ Copyright (C) 2013 Interbull Centre
#~
#~ This program is free software: you can redistribute it and/or modify
#~ it under the terms of the GNU General Public License as published by
#~ the Free Software Foundation, either version 3 of the License, or
#~ (at your option) any later version.
#~
#~ This program is distributed in the hope that it will be useful,
#~ but WITHOUT ANY WARRANTY; without even the implied warranty of
#~ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#~ GNU General Public License for more details.
#~
#~  http://www.gnu.org/licenses/
#
# program trendtest2.py

'''
Perform trend validation by method 2 for one breed-population combination,
for all traits.
'''

# Revision history:
# 2013.10.21 GJansen - original version

import os
import sys
import argparse
from datetime import date
import numpy as np
import ibutils

# date stamp (YYYYMMDD) written to every output record
testdate = date.today().strftime('%Y%m%d')

# to see help summary: python trendtest2.py --help
epilog = ('See detailed instructions at: '
          'https://wiki.interbull.org/public/TrendTest_Software?action=print')

# see http://docs.python.org/2.7/howto/argparse.html
parser = argparse.ArgumentParser(epilog=epilog)
parser.add_argument('brd',
                    help='evaluation breed code (BSW/GUE/JER/HOL/RDC/SIM)')
parser.add_argument('pop',
                    help='population code (same as country code except for'
                         ' CHR/DEA/DFS/FRR/FRM)')
parser.add_argument('datadir',
                    help='absolute or relative path to data files')
parser.add_argument('-v', '--verbose', action='store_true',
                    help='increase output verbosity')
parser.add_argument('-c', '--controlfile',
                    help='path/name of the control file (default=DATADIR/'
                    'file305_POPBRD)')
parser.add_argument('-m', '--mergefiles', action='store_true',
                    help='write merged data files (for independent data'
                    ' checks)')
parser.add_argument('-M', '--mergedir',
                    help='absolute or relative path for merged data files'
                    ' (default=DATADIR/merged2)')
args = parser.parse_args()

# breed code as returned by the ibutils validator; population code upper-cased
brd = ibutils.check_breed(args.brd)
pop = args.pop.upper()
# suffix shared by all input/output file names, e.g. file305_POPBRD
_POPBRD = '_' + pop + brd

# Resolve DATADIR while the caller's working directory is still current:
# after the chdir below a relative DATADIR would resolve to DATADIR/DATADIR.
abs_datadir = os.path.abspath(args.datadir)

if not os.path.exists(args.datadir):
    print('absolute DATADIR: ' + abs_datadir)
    print('%s: error: DATADIR does not exist or has incorrect permissions'
          % sys.argv[0])
    sys.exit(1)
if args.mergefiles:
    # the merge directory is made absolute here, before the chdir, so that a
    # relative --mergedir is interpreted relative to the caller's directory
    mergedir = os.path.join(args.datadir, 'merged2')
    mergedir = os.path.abspath(args.mergedir if args.mergedir else mergedir)
    if not os.path.exists(mergedir):
        os.makedirs(mergedir)

#============================================================
os.chdir(args.datadir)                  # NB! move to DATADIR
#============================================================

# the log file is created inside DATADIR (current directory from here on)
filelog = 'tt2' + _POPBRD + '.log'
if args.verbose:
    print('%s: writing log to %s/%s' % (sys.argv[0], args.datadir, filelog))
log = open(filelog, 'w')

ibutils.dated_msg(sys.argv[0]+': start', log=log)
log.write(sys.argv[0] + ' version=' + ibutils.version + '\n')

if args.verbose:
    log.write('absolute DATADIR: %s\n' % abs_datadir)


log.write('Processing brd=%s pop=%s datadir=%s\n'
      % (brd, pop, args.datadir))

def bomb(msg, rc):
    '''
    Report a fatal error and terminate the program.

    msg -- text written verbatim (no newline is added) to the log file and
           then to standard error
    rc  -- value handed to sys.exit()
    '''
    for stream in (log, sys.stderr):
        stream.write(msg)
    sys.exit(rc)

#------------------------------------------------------------------------------
# read and store birth years
# byear maps the animal id (first 19 characters of each record) to the
# 4-character birth year taken from columns 21-24, with blanks turned into
# zeros so that the value can later be converted with int()
byear = {}; filebd = 'bdate' + _POPBRD
with open(filebd) as fbd:               # close the file as soon as it is read
    for line in fbd:
        byear[line[:19]] = line[20:24].replace(' ', '0')
if args.verbose:
    log.write('stored %d records from file %s\n' % (len(byear), filebd))

#------------------------------------------------------------------------------
# process data trait by trait, in the order listed in the control file
if args.verbose:
    log.write('opening output file %s ...\n' % ('file312' + _POPBRD))
# NB: the script has already changed directory to DATADIR, so a relative
# --controlfile path is resolved against DATADIR
file305 = args.controlfile if args.controlfile else ('file305' + _POPBRD)
fout = open('file312' + _POPBRD, 'w')
first_rec = True
# read all control records up front so the control file is closed promptly
with open(file305) as fctl:
    control_recs = fctl.readlines()
for rec in control_recs:
    if not rec.strip() or rec[0] == '#':
        continue  # skip blank lines and header lines
    try:
        # 15 whitespace separated fields are expected; several of them are
        # parsed only to validate the record layout and are not used below
        (trtg, trt, evdate, herit, siresd, merit, type2x, min_hrd, min_dgh,
         byr1, miny, maxy, R, yy_mon, chg) = rec.split()
        siresd = float(siresd)
        min_byear = int(byr1)
        min_hrd = int(min_hrd)
        min_dgh = int(min_dgh)
    except ValueError:
        # wrong number of fields or a non-numeric value in a numeric field
        print('error: could not parse this line from file ' + file305)
        print('line :: ' + rec)
        print('please review the documentation')
        sys.exit(9)

    merit = merit.upper()
    if merit not in ('B+', 'B-', 'T+', 'T-'):
        print('error: genetic merit must be B+/B-/T+/T- in file305')
        print('line :: ' + rec)
        sys.exit(9)
    bvta = 'BV' if merit[0] == 'B' else 'TA'
    # the sire SD from the control file is doubled for T+/T- merit codes
    sdg = siresd * (2. if bvta == 'TA' else 1.)
    warnings = []

    log.write('\n\n' + ('='*33) + (' Trait "%s" ' % trt) + ('='*33) + '\n')
    if args.verbose:
        log.write('\n' + '-'*79 + '\n')
        log.write('file305  trt byr1 herds daus type2x bvta    sdg\n')
        log.write('inputs   %s %s %5d%5d    %s    %s  %s\n' %
                  (trt, byr1, min_hrd, min_dgh, type2x, bvta, sdg))
        log.write('-'*79 + '\n')

    #--------------------------------------------------------------------------
    # read and store regular evaluation file
    data300, log = ibutils.read_file300(brd, pop, trt, 'file300' + _POPBRD,
                                        args, log)
    # read and store within year DD records for this trait
    # data302[aid][year] = (n. daughters, DD) with year kept as a string
    data302 = {}; nr = 0; kept = 0
    with open('file302' + _POPBRD) as f302:
        for line in f302:
            rectype, brd1, pop1, trt1, aid, year, nd, dd = line.split()
            nr += 1
            if brd1 != brd or pop1 != pop or trt1 != trt:
                continue
            kept += 1
            years = data302.setdefault(aid, {})
            if year in years:
                print('error: duplicate 302 records for trt=%s bull=%s year=%s'
                      % (trt, aid, year))
                sys.exit(8)
            years[year] = (int(nd), float(dd))
    if kept == 0:
        if args.verbose:
            log.write('warning: no DD records found for trait %s\n\n' % (trt))
        continue # skip this trait
    if args.verbose:
        log.write('read %d within year DD records from file302\n' % nr)
        log.write('kept %d DD records on %d bulls for trait %s\n' %
                  (kept, len(data302), trt))

    # check for bulls present in file302 but not file300
    missing = sorted(aid for aid in data302 if aid not in data300)
    if (missing and args.verbose) or len(missing) > 10:
        n = len(missing)
        log.write('\nwarning: skipping %d bulls in file302 but not file300 %s' %
                  (n, '\n' if n <= 5 else '(first 5 ...)\n'))
        log.write(' '.join(missing[:5]) + '\n\n')

    #==========================================================================
    # edit data and merge proofs
    # y = DD values, x = year offset within bull, bull = bull index per record,
    # by = birth year per record, nb = number of qualifying bulls so far
    y=[]; x=[]; bull=[]; by=[]; missing=[]; nb=0
    # the merge file is opened lazily at the first qualifying bull; keeping
    # None here lets the close below cope with traits without any such bull
    fmerge = None
    # process bulls on 300 file and look for stored records from 302 dataset
    for aid in data300:
        # bulls without a stored birth date get year 0
        byr = byear.get(aid, '0000')
        # first edit: skip any bulls born before min_byear
        if int(byr) < min_byear:
            continue
        # fetch current data
        top, off, sta, nd, nh, edc, rel, ebv = data300[aid]
        # top = type of proof (11, 12, 21 etc)
        # off = officially publishable proof (Y/N)
        # sta = bull status
        # nh = n. herds
        # nd = n. daughters

        # additional data edits
        # -- is bull a domestic AI bull (or foreign bull for small pops)
        if not ((top<'20' and sta != '20') or (type2x == 'Y' and top == '21')):
            continue
        # -- does bull meet minimums for daus/herds
        if nh < min_hrd or nd < min_dgh:
            continue

        # fetch and check within year DD data
        if aid not in data302:
            missing.append(aid)
            continue
        data = data302[aid]
        # -- skip first year(s) if n.daus < 10
        data_ok = []
        for year in sorted(data):
            (nj, ddj) = data[year]
            if not data_ok and nj < 10:
                continue
            data_ok.append((int(year), nj, ddj))
        # -- skip bulls with all daughters in a single qualifying year
        if len(data_ok) < 2:
            continue

        # okay, we finally have a qualifying bull
        if fmerge is None and args.mergefiles:
            # create merged dataset for additional checks with SAS, R, etc.
            fmerge = open(os.path.join(mergedir, trt + '.csv'), 'w')
        year0 = data_ok[0][0]
        for (year, nj, ddj) in data_ok:
            j = year - year0            # years since first qualifying year
            y.append(ddj)
            x.append(float(j))
            bull.append(nb)
            by.append(int(byr))
            if fmerge is not None:
                fmerge.write(' '.join((aid, byr, top, off, sta, 'f300')) +
                             (' %d %d %d %s %s' % (nd,nh,edc,rel,str(ebv))) +
                             (' f302 %d %d %0.3f' % (year, nj, ddj)) +
                             (' %d %d\n' % (nb, j)))
        nb += 1

    if fmerge is not None:
        fmerge.close()

    if (missing and args.verbose) or len(missing) > 5:
        n = len(missing)
        if n > 5 and n > int(0.01 * nb):
            # add warning tag in result file >1% of bulls are missing
            warnings.append('MISSING_BULLS')
            log.write('WARNING: MISSING BULLS! (%d for trait %s)' % (n, trt))
        log.write('\nwarning: found %d qualifying bulls in file300 that are not'
                  ' in file302 %s' % (n, '\n' if n <= 5 else '(first 5 ...)\n'))
        missing.sort()
        log.write(' '.join(missing[:5]) + '\n')

    ny = len(y)
    if ny == 0:
        if args.verbose:
            log.write('warning: no merged records after edits for trait %s\n\n'
                      % (trt))
        continue
    if not args.verbose:
        log.write('\n')

    log.write('\nSummary statistics for trait "%s" (%d records on %s bulls)\n'
              % (trt, ny, nb))
    log.write('-'*62 + '\n')
    log.write('Trait Variable          Mean       Std      Min      Max\n')
    log.write('-'*62 + '\n')
    fmt = '%s   %s %9.3f %9.3f %8.2f %8.2f\n'
    y = np.array(y)
    sdy = y.std(ddof=1)
    log.write(fmt % (trt, 'DD      (y)', y.mean(), sdy, y.min(), y.max()))
    x = np.array(x)
    sdx = x.std(ddof=1)
    log.write(fmt % (trt, 'YEAR J  (x)', x.mean(), sdx, x.min(), x.max()))
    z = np.array(by)
    log.write(fmt % (trt,'BIRTH YEAR ',z.mean(),z.std(ddof=1),z.min(),z.max()))
    log.write('-'*62 + '\n\n')

    #---------------------------------------------------------------------------
    # estimation of b, absorbing bull effect (incidence matrix Z)
    # per-bull totals of y and x and per-bull record counts
    Zy = np.bincount(bull, weights=y, minlength=nb)
    Zx = np.bincount(bull, weights=x, minlength=nb)
    ZZ = np.bincount(bull, minlength=nb).astype(float)
    b = (sum(x*y) - sum(Zx*Zy/ZZ)) / (sum(x*x) - sum(Zx*Zx/ZZ))
    log.write('Estimate of b from model yij=BULLi+b*j+eij is %0.3f'
              ' for trait %s\n' % (b, trt))
    # -- biological test
    testval = abs(b) / sdg
    pass2 = 'PASS' if testval <= 0.01 else 'FAIL'
    log.write('Testval = abs(b)/SDg = %0.3f/%0.3f = %0.4f  ==> %s for %s\n' %
              (abs(b), sdg, testval, pass2, trt))

    if first_rec:
        # column header is written once, before the first result record
        fout.write('rec brd pop tgrp trt testdate pass testval       b      SDg'
                   ' bv bulls   std_DD x byr1 mh md warnings\n')
        first_rec = False
    # prepare output record in parts
    p1 = '312 %s %s %s %s %s %s' % (brd, pop, trtg, trt, testdate, pass2)
    p2 = ' %7.3f %7.3f %8.3f %s %5d %8.3f' % (testval, b, sdg, bvta, nb, sdy)
    p3 = ' %s %s %2d %2d' % (type2x, byr1, min_hrd, min_dgh)
    p4 = ' ' + ','.join(warnings) if warnings else ' none'
    fout.write(p1 + p2 + p3 + p4 + '\n')
    log.write('\n')
fout.close()

ibutils.dated_msg(sys.argv[0]+': end', log=log)
log.close()
