#!/usr/local/bin/perl

# Example program 5.  Perform a common ancestor analysis of
# a tree, given a structure in the standard Newick format.
# Each leaf node specifies a nucleotide, or string of
# nucleotides.  All leaves must have the same length.

# Unfortunately, PERL does not have a simple and efficient
# way to represent trees.  In this example program, we
# store all of the elements in an array.  The root is at
# index 0.  For any node X in the tree, its left child
# resides at index 2X + 1, the right child at index 2X + 2.


# Define a way to mark internal nodes in the tree (as
# opposed to leaves) while the input is being read.  Thus
# when the tree is evaluated, we know that these nodes must
# be computed.  Define the root and current location.

$internal_node = "-";

$root = 0;
$pos  = 0;

# Store and echo the input.

$input = @ARGV[0];

printf( "Initial tree: $input\n\n" );

# Parse the input string, one character at a time.  The
# characters "(", ",", and ")" have special meaning to the
# parsing.  "(" indicates we must move down a level in the
# tree, to the next left child.  "," means to switch from
# the left child to the right child.  ")" means to move
# upward a level in the tree.  Any other character must be
# part of the label for the current node.

for ( $i = 0; $i < length( $input ); $i++ )
{
  $char = substr( $input, $i, 1 );

  if ( $char eq "(" )
  {
    $pos = $pos * 2 + 1;
  } # if
  elsif ( $char eq "," )
  {
    $pos++;
  } #elsif
  elsif ( $char eq ")" )
  {
    $pos = int( $pos / 2 ) - 1;
    $tree[$pos] = $internal_node;
  } # elsif
  else
  {
    $tree[$pos] = $tree[$pos] . $char;
    $leaf_len = length( $tree[$pos] );
  } # else
} # for i

# If the input has at least matching parentheses, we should
# end up back at the root node after parsing the input.

if ( $pos != $root )
{
  printf( "$0: invalid input string.\n\n" );
  exit( 1 );
} # if


# Compute the common ancestor, one character at a time.
# Thus we compute and print the common ancestor for the
# first character of all leaf nodes, the second character
# of all leaf nodes, etc.  If the result is a single
# letter, simply print it.  If it is a longer string, print
# them in parentheses.

printf( "Common ancestor: " );

for ( $i = 0; $i < $leaf_len; $i++ )
{
  $ancestor = evaluate( $root, $i );
  if ( length( $ancestor ) == 1 )
  {
    printf( "$ancestor" );
  } #if
  else
  {
    printf( "($ancestor)" );
  } # else
} # for i

printf( "\n\n" );


# A recursive function to evaluate a subtree, and return
# the common ancestor string.  The first input is a pointer
# to the current node being evaluated, the second is which
# input character is currently under consideration.

sub evaluate
{
  my( $ptr, $pos ) = @_;
  my( $left, $right, $eval_str );

  # If this is an internal node, compute it from its
  # children.  As stated in Chapter 5, if the intersection
  # of the child nodes is non-empty, it is used.  If that
  # intersection is empty, the union of the child nodes is
  # used.

  if ( $tree[$ptr] eq $internal_node )
  {
    $left  = evaluate(  left_child( $ptr ), $pos );
    $right = evaluate( right_child( $ptr ), $pos );

    $eval_str = intersection( $left, $right );

    if ( length( $eval_str ) == 0 )
    {
      $eval_str = union( $left, $right );
    } # if
  } # if
  else
  {
    # This is a leaf node: it simply evaluates to the
    # correct character position in this string.
    $eval_str = substr( $tree[$ptr], $pos, 1 );
  } # else

  return $eval_str;
} # evaluate


# Function to return the pointer to the left child of a
# given node.

sub left_child
{
  my( $ptr ) = @_;

  return $ptr * 2 + 1;
} # left_child


# Function to return the pointer to the right child of a
# given node.

sub right_child
{
  my( $ptr ) = @_;

  return $ptr * 2 + 2;
} # right_child


# Function to calculate the intersection of two strings.

sub intersection
{
  my( $a, $b ) = @_;
  my( $str, $i, $char );

  for ( $i = 0; $i < length( $a ); $i++ )
  {
    $char = substr( $a, $i, 1 );

    if ( index( $b, $char ) != -1 )
    {
      $str = $str . $char;
    } # if
  } # for i

  return $str;
} # intersection


# Function to calculate the union of two strings.

sub union
{
  my( $a, $b ) = @_;
  my( $str, $i, $char );

  $str = $a;

  for ( $i = 0; $i < length( $b ); $i++ )
  {
    $char = substr( $b, $i, 1 );

    if ( index( $a, $char ) == -1 )
    {
      $str = $str . $char;
    } # if
  } # for i

  return $str;
} # union

