#!/usr/bin/env python
from __future__ import print_function
from sys import argv
from math import sqrt
from collections import OrderedDict

from numpy import loadtxt, int32, concatenate


def read_cphlog(cphlogfilename):
    # Read the header and determine the pH value.
    pH = None
    for line in open(cphlogfilename, 'r'):
        if not line.startswith('#'):
          break
        tokens = line.lstrip('#').strip().split()
        try:
            pH = float(tokens[tokens.index('pH')+1])
        except ValueError:
            pass
    if pH is None:
        raise ValueError('No pH info in header of %s'%cphlogfilename)

    # Read the lambda_1 and lambda_2 columns (ignore the time stamp).
    lambda_ = loadtxt(cphlogfilename, int32, usecols=(1, 2))

    return lambda_, pH 

# Read and concatenate any number of files, sorting by the pH (use the pH value
# as a dictionary key). Since standard dict objects cannot be sorted, used the
# special OrderedDict class to sort the pH values in ascending order.
#
tmp = OrderedDict()
for cphlogfilename in argv[1:]:
    lambda_, pH = read_cphlog(cphlogfilename) 
    if pH not in tmp:
        tmp[pH] = lambda_
    else:
        tmp[pH] = concatenate((tmp[pH], lambda_))

pHs = tmp.keys()
pHs.sort()
cphlogs = OrderedDict()
for pH in pHs:
    cphlogs[pH] = tmp[pH]
del tmp    

# Each value in cphlogs is now an ndarray with shape (nsamples, 2), where
# nsamples is the total number of simulation cycles at that pH. The second axis
# of the array (axis=1) corresponds to the two protonation sites.
#
# Compute chi as the joint probability of _either_ site 1 or site 2 being
# occupied:
#
print('#  pH   frac')
for pH, lambda_ in cphlogs.iteritems():
    chi = lambda_[:, 0]*(1 - lambda_[:, 1]) + (1 - lambda_[:, 0])*lambda_[:, 1]
    prot_frac = chi.mean()
    prot_frac_err = sqrt(prot_frac*(1 - prot_frac) / chi.size)
    print('%5.2f %6.4f %6.4f'%(pH, prot_frac, prot_frac_err))