/***************************************************************************

  Mol_Volume: A program for calculating the macromolecular volume.

  VERSION: 1.0.

  AUTHOR:  Alexander Balaeff.

 (C) Copyright 2001 The Theoretical Biophysics Group, Beckman Institute, and
                    The Board of Trustees of the University of Illinois

 ***************************************************************************
  DESCRIPTION:

  This program calculates the volume of a macromolecule by a method akin to
  the Monte Carlo method, namely, by counting how many nodes of a dense
  regular grid lie within the van der Waals radius plus the probe radius of
  at least one of the molecule's atoms.
  The volume is then calculated as

         V = V_grid * N_near / N_total = N_near * V_per_node.

 ***************************************************************************
  SUGGESTED COMPILATION COMMAND LINE (FOR A DEC-ALPHA CC-COMPILER):

  cc -lm -fast -tune host -arch host -assume whole_program \
     -o mol_volume mol_volume.c

 ***************************************************************************
  COMMAND LINE FOR THE PROGRAM EXECUTION:

  mol_volume < config_file

  A single printout line will report the estimated molecular volume.

 ***************************************************************************
  A SAMPLE CONFIGURATION FILE:

  R_PROBE       2.0
  GRID_STEP     0.5
  PDB_NAME      lac_w_dna_types.pdb
  VDW_RAD_FILE  atom_rad.dat

 ***************************************************************************
  EXPLANATION OF THE CONFIGURATION PARAMETERS:

  R_PROBE      - the radius of the spherical "probe" (in Angstroms), added 
                 to the atoms' van der Waals radii in order to calculate
                 the amount of space that the biomolecule makes inaccessible.
		     Default: 0 A.
  GRID_STEP    - the size of one step of the 3D grid (in Angstroms) used to 
                 estimate the volume of the macromolecule.  Default: 0.25 A.
  PDB_NAME     - the name of the file, containing the coordinates of the 
                 atoms of the macromolecule in PDB format.  The last (11th) 
		     column of the PDB file, a string of at most 5 characters, 
		     should contain the chemical types of the atoms, e.g., 
		     such as defined by the CHARMM force field 
		     (http://www.pharmacy.umaryland.edu/~alex/research.html)
		     Such PDB file can be generated from a usual PDB/PSF couple
		     by using the following X-PLOR (http://atb.csb.yale.edu/xplor)
		     script:
		     
		             structure @lac_w_dna.psf end
				 coor @lac_w_dna.pdb
				 vector do (segid=chem) (all)
				 write coor output="lac_w_dna_types.pdb" end
				 stop

		     Default: the program exits if no PDB file name is supplied.

  VDW_RAD_FILE - the name of the data file, where van der Waals radii
                 are defined for the atoms of different chemical types. 
		     Such file can be constructed, for example, from the CHARMM 
		     force field files parallh22x.nuc, parallh22x.pro (in X-PLOR
		     format) by issuing the following commands in a UNIX shell:

                 gawk 'BEGIN{a=0.5*exp(log(2)/6)}; \
                       $1=="NONBONDED"{print $2,a*$4}' parallh22x.nuc > rad_nuc 
                 gawk 'BEGIN{a=0.5*exp(log(2)/6)}; \
		           $1=="NONBONDED"{print $2,a*$4}' parallh22x.pro > rad_pro 
                 cat rad_pro rad_nuc | sort \
                     | gawk 'NR==1{a=$0; print}; $0!=a{print; a=$0}' > atom_rad.dat

                 Default: van der Waals radii are set to 1.7 A for the atoms 
		     whose chemical types were not found in the VDW_RAD_FILE.  
		     If no VDW_RAD_FILE is specified, all atoms are assigned 
		     the radius of 1.7 A.

 ***************************************************************************/


/**********************************/
/*** The required C libraries:  ***/

#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>


/************************************/
/*** The compilation parameters:  ***/
/***                              ***/
/*** MAXATOMS is the length of the static per-atom arrays of main() ***/
/*** (the coordinates x_at, y_at, z_at and the radii r_at).         ***/
/*** MAXTYPES is the length of the van der Waals database arrays    ***/
/*** of main() (the type names vdw_type and the radii vdw_rad).     ***/
#define MAXATOMS 50000  /*** the largest allowed number of macromolecule atoms ***/
#define MAXTYPES  1000  /*** the largest allowed number of atom types ***/
                        /*** in the database of van der Waals radii ***/

/*** If any of the above parameters is insufficient for your system, ***/
/*** recompile the program with that number accordingly changed.     ***/


/**********************************/
/*** The program return codes:  ***/
/***                            ***/
/*** main() keeps a single return code variable: a non-fatal problem  ***/
/*** overwrites it and the run continues, so only the code of the     ***/
/*** last problem is reported; a fatal problem returns immediately.   ***/

/*** successful run (the initial value of the return code) ***/
#define ERR_NO_ERROR             0
/*** not referenced in this file ***/
#define ERR_FAILED_OUTPUT       -2
/*** a problem with the PDB file, e.g., it can not be opened (fatal) ***/
#define ERR_BAD_PDBFILE         -3
/*** the van der Waals radii file can not be opened; default radii are used ***/
#define ERR_BAD_VDWFILE         -4
/*** the van der Waals radii file does not fit into MAXTYPES entries; ***/
/*** the entries beyond that limit are ignored                        ***/
#define ERR_LONG_VDWFILE        -5
/*** an atom's chemical type is absent from the database; default radius used ***/
#define ERR_UNKNOWN_ATOM_TYPE  -10
/*** not referenced in this file ***/
#define ERR_INSUF_MEM_SQRTS    -31
/*** the proximity map of the grid could not be allocated (fatal) ***/
#define ERR_INSUF_MEM_EXCLMAP  -32
/*** not referenced in this file ***/
#define ERR_INSUF_MEM_ENERGY   -33


/*******************************************/
/***  The list of auxiliary subroutines  ***/
/*** reading the configuration file.     ***/
/***                                     ***/
/*** Arguments, in order: the configuration line, the keyword to   ***/
/*** look for in it, and the destination of the parsed parameter.  ***/
/*** Each subroutine returns 1 if the keyword was found and its    ***/
/*** parameter was parsed, and 0 otherwise.                        ***/
int read_config_line_int(char *, const char *, long int *);
int read_config_line_double(char *, const char *, double *);
int read_config_line_string(char *, const char *, char *);


/**********************************************************/
/******************** THE MAIN MODULE  ********************/
/**********************************************************/

/*** Program entry point.                                               ***/
/***                                                                    ***/
/*** Reads the "KEYWORD PARAMETER" configuration lines from the         ***/
/*** standard input, loads the optional van der Waals radii database    ***/
/*** and the PDB file, marks every node of a regular grid that lies     ***/
/*** within (van der Waals radius + probe radius) of at least one atom, ***/
/*** and prints the volume estimate N_marked_nodes * GRID_STEP^3.       ***/
/***                                                                    ***/
/*** argc, argv - unused; all the input comes from the standard input.  ***/
/***                                                                    ***/
/*** Returns ERR_NO_ERROR on success; otherwise the code of the fatal   ***/
/*** error, or of the last non-fatal problem encountered.               ***/
int main(int argc,
         char *argv[])
{

/************************************/
/***  THE LIST OF THE VARIABLES:  ***/
/************************************/

/** The return code; a non-fatal problem overwrites it, so the code **/
/** of the last problem encountered is the one finally returned:    **/
  int return_code=ERR_NO_ERROR;
/** The code returned for a non-positive GRID_STEP (local to main): **/
  enum { ERR_BAD_GRID_STEP = -6 };

/*****************************************/
/*** The van der Waals radii database. ***/
/***                                   ***/
/*** The database file and its name:   ***/
  FILE *vdw_file=NULL;
  char vdw_name[100]="";
/*** The database existence flag: ***/
  int vdw_flag=0;
/*** Atom chemical type names (at most 5 characters + the terminator):  ***/
  char vdw_type[MAXTYPES][6];
/*** The radii:  ***/
  double vdw_rad[MAXTYPES];
/*** The number of the chemical types:  ***/
  int N_vdw=0;
/*** The default radius:  ***/
  double def_vdw_rad = 1.7;
/*** One database entry, parsed before it is stored:  ***/
  char type_buf[6];
  double rad_buf;

/***************************/
/*** The macromolecule.  ***/
/***                     ***/
/*** The PDB file and its name:  ***/
  FILE *pdb_file;
  char pdb_name[100]="";
/*** the number of atoms:  ***/
  long int N_atoms=0;
/*** atomic coordinates (in the units of the grid step):  ***/
  static double x_at[MAXATOMS], y_at[MAXATOMS], z_at[MAXATOMS];
/*** atomic radii, probe radius included (in the units of the grid step):  ***/
  static double r_at[MAXATOMS];
/*** the largest radius:  ***/
  double r_max=0.0;
/*** the square of a current atom's radius:  ***/
  double excl_rad_sq;
/*** chemical type of a current atom (at most 5 characters + the terminator):  ***/
  char atom_type[6];
/*** the dimensions of the macromolecule:  ***/
  double x_max=0.0, y_max=0.0, z_max=0.0;
  double x_min=0.0, y_min=0.0, z_min=0.0;
/*** the probe radius: ***/
  double r_probe=0.0;
/*** the volume of the macromolecule: ***/
  double Vol;

/************************/
/*** Grid parameters. ***/
/***                  ***/
/*** grid step size:  ***/
  double delta=0.25;
/*** grid margin around the macromolecule: ***/
  double border_width;
/*** the dimensions of the grid box, first computed in floating point ***/
/*** so that they can be validated before the conversion to integers: ***/
  double dim_0, dim_1, dim_2, nodes_tot;
  long int N_box_0, N_box_1, N_box_2;
/*** the total number of nodes in the grid:  ***/
  long int N_box_tot;
/*** the proximity map for the grid, where 1's mark the grid nodes  ***/
/*** found to be closer than van der Waals plus the probe radius to ***/
/*** an atom of the macromolecule.                                  ***/
  unsigned char *I_close;
/*** the number of grid nodes in the proximity of the macromolecule:  ***/
  long int N_Vol;
/*** the boundaries of an atom vicinity on the grid:  ***/
  long int i_max, j_max, k_max;
  long int i_min, j_min, k_min;
/*** coordinate counters for that vicinity:  ***/
  long int ii, jj, kk;
/*** the auxiliary variables used to build the 1D index from the three 3D counters:  ***/
  long int ind, ind0, ind1;
/*** squared distances from an atom to a grid point along each axis:  ***/
  double d0, d1, d2;

/******************************************************/
/*** Loop counters and auxiliary (dummy) variables. ***/
/***                                                ***/
  long int i, j, k;
  char s[100];
  double f_dum;
  int n_read;


/********************************/
/*** THE BODY OF THE PROGRAM. ***/
/********************************/

  (void)argc;
  (void)argv;

/************************************************/
/*** Readout of the configuration parameters. ***/
/************************************************/
  /** fgets() bounds the read to the buffer; a line longer than the   **/
  /** buffer arrives in several pieces, each one parsed on its own.   **/
  while( fgets(s,(int)sizeof s,stdin) ) {
    /** drop the end-of-line so that the diagnostics of the readers print cleanly **/
    s[strcspn(s,"\r\n")] = '\0';
    read_config_line_double(s,"R_PROBE",&r_probe);
    read_config_line_double(s,"GRID_STEP",&delta);
    read_config_line_string(s,"VDW_RAD_FILE",vdw_name);
    read_config_line_string(s,"PDB_NAME",pdb_name);
  }

  /** All the lengths are divided by the grid step below, so it must be positive **/
  /** (the negated comparison also rejects a NaN).                               **/
  if( !(delta > 0.0) ) {
    printf("ERROR: the grid step must be positive (GRID_STEP is %f A).\n", delta);
    return ERR_BAD_GRID_STEP;
  }


/*******************************************/
/*** Readout of VDW atom radii database. ***/
/*******************************************/

  vdw_flag = ( vdw_name[0] != '\0' );

  /** Check whether the database was specified and is existing **/
  if( !vdw_flag ) {
    printf("WARNING: the van der Waals radii data file is undefined.\n");
    printf("The van der Waals radii of all the atoms will be set to the default value of %3.1f A\n",
           def_vdw_rad);
  }
  else {
    if( (vdw_file=fopen(vdw_name,"r")) == NULL ) {
      printf("WARNING: the van der Waals radii data file %s can not be opened for reading!\n",
             vdw_name);
      printf("The van der Waals radii of all the atoms will be set to the default value of %3.1f A\n",
             def_vdw_rad);
      vdw_flag = 0;
      return_code = ERR_BAD_VDWFILE;
    }
  }

  /** If everything is OK read the database line by line **/
  if( vdw_flag ) {
    while( fgets(s,(int)sizeof s,vdw_file) ) {
      /** skip the lines which are not of the form "TYPE RADIUS" (e.g., blank ones); **/
      /** the field width keeps the type name within its 6-byte buffer               **/
      if( sscanf(s,"%5s %lf", type_buf, &rad_buf) != 2 ) continue;

      /** warn only when there really is an entry that does not fit **/
      if( N_vdw == MAXTYPES ) {
        printf("WARNING: The number of entries in the van der Waals radii database %s\n", vdw_name);
        printf("exceeds the allocated memory. Only the first %d positions were read.\n", MAXTYPES);
        printf("You may want to recompile the program with a larger MAXTYPES parameter.\n\n");
        return_code = ERR_LONG_VDWFILE;
        break;
      }

      strcpy(vdw_type[N_vdw], type_buf);
      vdw_rad[N_vdw] = rad_buf;
      N_vdw++;
    }

    fclose(vdw_file);
  }


/********************************************/
/*** Readout of the macromolecular atoms. ***/
/********************************************/
  if( (pdb_file=fopen(pdb_name,"r")) == NULL ) {
    printf("ERROR: the PDB file %s can not be opened for reading!\n", pdb_name);
    return ERR_BAD_PDBFILE;
  }

  i = 0;
  while( fgets(s,(int)sizeof s,pdb_file) ) {
    if( strncmp(s,"ATOM",4) != 0 ) continue;

    /** a truncated molecule would give a wrong volume, hence a fatal error **/
    if( i == MAXATOMS ) {
      printf("ERROR: the PDB file %s contains more than %d atoms.\n", pdb_name, MAXATOMS);
      printf("You may want to recompile the program with a larger MAXATOMS parameter.\n");
      fclose(pdb_file);
      return ERR_BAD_PDBFILE;
    }

    s[strcspn(s,"\r\n")] = '\0';

    /** The coordinates start at the 31st column; a shorter record has none. **/
    /** The type stays empty (i.e., unknown) if the record has no 11th field. **/
    atom_type[0] = '\0';
    n_read = 0;
    if( strlen(s) > 30 )
      n_read = sscanf(s+30,"%8lf%8lf%8lf %lf %lf %5s",
                      x_at+i, y_at+i, z_at+i, &f_dum, &f_dum, atom_type);
    if( n_read < 3 ) {
      printf("WARNING: a malformed ATOM record of the pdb file %s was skipped:\n%s\n",
             pdb_name, s);
      return_code = ERR_BAD_PDBFILE;
      continue;
    }

    if( vdw_flag ) {
      /** detect the atom chemical type **/
      for( j=0; j<N_vdw; j++)
        if( strcmp(vdw_type[j],atom_type) == 0 ) break;
      if( j < N_vdw )
        r_at[i] = (vdw_rad[j] + r_probe) / delta;
      else {
        r_at[i] = (def_vdw_rad + r_probe) / delta;

        printf("WARNING: the %ld-th atom of the pdb file %s has an unknown chemical type %s.\n",
               i+1, pdb_name, atom_type);
        printf("The van der Waals radius of this atom is set to the default value of %3.1f A\n",
               def_vdw_rad);
        return_code = ERR_UNKNOWN_ATOM_TYPE;
      }
    }
    else
      r_at[i] = (def_vdw_rad + r_probe) / delta;

    if( i==0 || r_max < r_at[i] ) { r_max=r_at[i]; }

    /** from here on all the lengths are measured in grid steps **/
    x_at[i] /= delta;
    y_at[i] /= delta;
    z_at[i] /= delta;

    /** update the size of the system **/
    if( i==0 || x_max < x_at[i] ) { x_max=x_at[i]; }
    if( i==0 || x_min > x_at[i] ) { x_min=x_at[i]; }
    if( i==0 || y_max < y_at[i] ) { y_max=y_at[i]; }
    if( i==0 || y_min > y_at[i] ) { y_min=y_at[i]; }
    if( i==0 || z_max < z_at[i] ) { z_max=z_at[i]; }
    if( i==0 || z_min > z_at[i] ) { z_min=z_at[i]; }
    i++;
  }

  fclose(pdb_file);
  N_atoms = i;

  if( N_atoms == 0 ) {
    printf("ERROR: no valid ATOM records were found in the PDB file %s.\n", pdb_name);
    return ERR_BAD_PDBFILE;
  }


/**********************************/
/*** The setup of the grid box. ***/
/**********************************/

/** Finding the size of the grid box.  The dimensions are validated in  **/
/** floating point: converting an out-of-range value to an integer, or  **/
/** overflowing the product of the dimensions, would be undefined.      **/
  border_width = r_max + 1.0;
  dim_0 = floor( x_max - x_min + 2.*border_width ) + 1.0;
  dim_1 = floor( y_max - y_min + 2.*border_width ) + 1.0;
  dim_2 = floor( z_max - z_min + 2.*border_width ) + 1.0;
  nodes_tot = dim_0*dim_1*dim_2;
  if( !( dim_0 >= 1.0 && dim_1 >= 1.0 && dim_2 >= 1.0 &&
         nodes_tot < (double)LONG_MAX ) ) {
    printf("ERROR: the grid box of %.0f x %.0f x %.0f nodes is invalid or too large.\n",
           dim_0, dim_1, dim_2);
    printf("Try to re-run the program with a larger grid step (currently, %f A).\n", delta);
    return ERR_INSUF_MEM_EXCLMAP;
  }
  N_box_0 = (long int)dim_0;
  N_box_1 = (long int)dim_1;
  N_box_2 = (long int)dim_2;
  N_box_tot = N_box_0*N_box_1*N_box_2;

/** Moving the atoms into the grid box. **/
  for( i=0; i<N_atoms; i++) {
    x_at[i] -= x_min - border_width;
    y_at[i] -= y_min - border_width;
    z_at[i] -= z_min - border_width;
  }

/** Allocating the proximity map; calloc() returns it zero-filled. **/
  if( (I_close=calloc((size_t)N_box_tot,1)) == NULL ) {
    printf("Not enough memory to allocate the proximity map (%ld Mb required).\n",
           N_box_tot/(1024*1024));
    printf("Try to re-run the program with a larger grid step (currently, %f A).\n", delta);
    return ERR_INSUF_MEM_EXCLMAP;
  }


/*******************************************************************/
/*** Computing the volume of the molecule by finding how many of ***/
/*** the grid nodes fall too close to the atoms of the molecule. ***/
/*******************************************************************/

/** Step 1: for each atom, mark the grid nodes which are located **/
/**         closer to this atom than r_at[i]: the probe radius   **/
/**         plus the atom's van der Waals radius.                **/
  for( i=0; i<N_atoms; i++) {
    excl_rad_sq = r_at[i]*r_at[i];

    /** finding the boundaries of the atom vicinity on the grid **/
    i_max = (long int)ceil(x_at[i] + r_at[i]);
    if( i_max >= N_box_0 ) i_max = N_box_0 - 1;
    i_min = (long int)floor(x_at[i] - r_at[i]);
    if( i_min < 0 ) i_min = 0;

    j_max = (long int)ceil(y_at[i] + r_at[i]);
    if( j_max >= N_box_1 ) j_max = N_box_1 - 1;
    j_min = (long int)floor(y_at[i] - r_at[i]);
    if( j_min < 0 ) j_min = 0;

    k_max = (long int)ceil(z_at[i] + r_at[i]);
    if( k_max >= N_box_2 ) k_max = N_box_2 - 1;
    k_min = (long int)floor(z_at[i] - r_at[i]);
    if( k_min < 0 ) k_min = 0;

    /** going over the points in this vicinity one by one,   **/
    /** determining whether they are closer than the allowed **/
    /** distance r_at[i] to the i-th atom; the 1D index of   **/
    /** the node (ii,jj,kk) is (ii*N_box_1 + jj)*N_box_2 + kk **/
    ind0 = i_min*N_box_1;
    for( ii = i_min; ii <= i_max; ii++, ind0 += N_box_1 ) {
      d0 = (double)ii - x_at[i];  /** pre-computed distances   **/
      d0 *= d0;                   /** speed up the calculation **/

      ind1 = (ind0 + j_min)*N_box_2;
      for( jj = j_min; jj <= j_max; jj++, ind1 += N_box_2 ) {
        d1 = (double)jj - y_at[i];  /** pre-computed distances   **/
        d1 *= d1;                   /** speed up the calculation **/

        ind = ind1 + k_min;
        for( kk=k_min; kk<=k_max; kk++, ind++ ) {

          /** check only the unmarked nodes **/
          if( ! I_close[ind] ) {
            d2 = (double)kk - z_at[i];
            d2 *= d2;
            if( d0 + d1 + d2 <= excl_rad_sq )
              I_close[ind] = 1;  /** mark the newly found close node **/
          }

        }
      }
    }
  }

/** Step 2: count the number of marked nodes. **/
  N_Vol = 0;
  for( k=0; k<N_box_tot; k++ )  if( I_close[k] )  N_Vol++;

  free(I_close);


/** Step 3: the molecular volume approximately equals the number of **/
/**         close nodes multiplied by the volume per node.          **/
  Vol = ((double)N_Vol)*delta*delta*delta;


/***************************/
/*** The final printout. ***/
/***************************/

  printf(" The volume of the macromolecule %s equals approximately to %5.2e A^3.\n",
         pdb_name, Vol);

  return return_code;

}


/************************/
/*** THE SUBROUTINES. ***/
/************************/

/*****************************************************************/
/*** The following subroutines read configuration parameters   ***/
/*** from the given configuration line, if the required format ***/
/***       KEYWORD  PARAMETER                                  ***/
/*** is matched.                                               ***/
/*****************************************************************/

/*** Reads an integer parameter from a configuration line.            ***/
/***                                                                  ***/
/*** line    - the configuration line; it is not modified.            ***/
/*** keyword - the parameter name; its first occurrence anywhere in   ***/
/***           the line is used.                                      ***/
/*** num     - receives the parsed value on success.                  ***/
/***                                                                  ***/
/*** Returns 1 if the keyword was found and an integer follows it,    ***/
/*** 0 otherwise.  If the keyword is present but no integer follows,  ***/
/*** the line is reported on the standard output.                     ***/
int read_config_line_int(char *line, const char *keyword, long int *num)
{
  char *sub;

  if( (sub=strstr(line,keyword)) == NULL )  return 0;

  /** The value is parsed right after the matched keyword (the conversion  **/
  /** skips the separating blanks).  No scanf format is built from the     **/
  /** keyword, so neither its length nor a '%' in it can cause any harm.   **/
  if( sscanf(sub+strlen(keyword),"%ld",num) != 1 ) {
    printf("Wrong configuration line:\n%s\n",line);
    return 0;
  }

  return 1;

}

/*** Reads a floating-point parameter from a configuration line.      ***/
/***                                                                  ***/
/*** line    - the configuration line; it is not modified.            ***/
/*** keyword - the parameter name; its first occurrence anywhere in   ***/
/***           the line is used.                                      ***/
/*** val     - receives the parsed value on success.                  ***/
/***                                                                  ***/
/*** Returns 1 if the keyword was found and a number follows it,      ***/
/*** 0 otherwise.  If the keyword is present but no number follows,   ***/
/*** the line is reported on the standard output.                     ***/
int read_config_line_double(char *line, const char *keyword, double *val)
{
  char *sub;

  if( (sub=strstr(line,keyword)) == NULL )  return 0;

  /** The value is parsed right after the matched keyword (the conversion  **/
  /** skips the separating blanks).  No scanf format is built from the     **/
  /** keyword, so neither its length nor a '%' in it can cause any harm.   **/
  if( sscanf(sub+strlen(keyword),"%lf",val) != 1 ) {
    printf("Wrong configuration line:\n%s\n",line);
    return 0;
  }

  return 1;

}

/*** Reads a one-word string parameter from a configuration line.     ***/
/***                                                                  ***/
/*** line    - the configuration line; it is not modified.            ***/
/*** keyword - the parameter name; its first occurrence anywhere in   ***/
/***           the line is used.                                      ***/
/*** word    - receives the blank-delimited word that follows the     ***/
/***           keyword.  The word is copied without a length limit,   ***/
/***           so the buffer must be able to hold any word of the     ***/
/***           line: strlen(line)+1 bytes always suffice.             ***/
/***                                                                  ***/
/*** Returns 1 if the keyword was found and a word follows it,        ***/
/*** 0 otherwise.  If the keyword is present but nothing follows,     ***/
/*** the line is reported on the standard output.                     ***/
int read_config_line_string(char *line, const char *keyword, char *word)
{
  char *sub;

  if( (sub=strstr(line,keyword)) == NULL )  return 0;

  /** The value is parsed right after the matched keyword (the conversion  **/
  /** skips the separating blanks).  No scanf format is built from the     **/
  /** keyword, so neither its length nor a '%' in it can cause any harm.   **/
  if( sscanf(sub+strlen(keyword),"%s",word) != 1 ) {
    printf("Wrong configuration line:\n%s\n",line);
    return 0;
  }

  return 1;

}

/**********************************************************************************/
