# Sridhar Ramachandran
# sramacha@cs.wright.edu
# 05/22/2005
# Murugan Samarpanam
# This script is to primarily investigate the Censor files and the Alu-in-Alu families.


#!/bin/perl

use strict;
use warnings;

#------------------------------------------------------------------------------
# Walks every chromosome folder under ../censor, scans each Censor output file
# in it and produces two kinds of output in the current directory:
#
#   aluinaludet.txt  - one line per "ALU..." header line found, prefixed with
#                      the name of the file it came from (overwritten per run).
#   <AluName>.txt    - one CSV line of alignment statistics per Alu record
#                      whose accumulated alignment length is at least
#                      $MIN_ALU_LENGTH (appended to, never truncated).
#
# An Alu record starts at a line beginning with "ALU" and ends at the first
# following line containing the word "transitions".  Inside a record, every
# line containing one of G, C, A, T or '-' is taken as an alignment row; rows
# are consumed in (top, bottom) pairs and compared column by column.
#------------------------------------------------------------------------------

my $CENSOR_DIR     = '../censor';
my $ALUDET_FILE    = 'aluinaludet.txt';
my $MIN_ALU_LENGTH = 450;    # records shorter than this are not reported

# Order of the comma-separated columns written to <AluName>.txt.
my @STAT_FIELDS = qw(
    matches mismatches transitions transversions
    terminalgaps internalgaps insertions deletions
    a2gap c2gap g2gap t2gap
    gap2a gap2c gap2g gap2t
    length
);

# Chemical class of each nucleotide: a substitution within one class is a
# transition, a substitution across classes is a transversion.
my %BASE_CLASS = (
    A => 'purine',
    G => 'purine',
    C => 'pyrimidine',
    T => 'pyrimidine',
);

opendir(my $censor_dh, $CENSOR_DIR)
    or die "Cannot open directory $CENSOR_DIR: $!\n";
my @censor_folders = readdir($censor_dh);
closedir($censor_dh);

open(my $aludet_fh, '>', $ALUDET_FILE)
    or die "Cannot write $ALUDET_FILE: $!\n";

foreach my $folder (@censor_folders) {

    # readdir() also returns '.' and '..'; following '..' would make the
    # script parse unrelated files in the parent directory as Censor output.
    next if $folder eq '.' || $folder eq '..';

    my $folder_path = "$CENSOR_DIR/$folder";
    next unless -d $folder_path;

    my $folder_dh;
    unless (opendir($folder_dh, $folder_path)) {
        warn "Skipping $folder_path: $!\n";
        next;
    }
    my @folder_files = readdir($folder_dh);
    closedir($folder_dh);

    foreach my $file_name (@folder_files) {

        # Skips '.', '..' and any other hidden entry.
        next if $file_name =~ /^\./;

        my $file_path = "$folder_path/$file_name";
        next unless -f $file_path;

        process_censor_file($file_path, $file_name, $aludet_fh);
    }
}

close($aludet_fh) or die "Error closing $ALUDET_FILE: $!\n";

#------------------------------------------------------------------------------
# process_censor_file($path, $file_name, $aludet_fh)
#
# Reads one Censor output file line by line.  Every "ALU" header line is
# logged to $aludet_fh and starts a fresh statistics record; the record is
# closed (and written out if long enough) at the next "transitions" line.
# A file that cannot be opened is reported with a warning and skipped.
#------------------------------------------------------------------------------
sub process_censor_file {
    my ($path, $file_name, $aludet_fh) = @_;

    my $chrom_fh;
    unless (open($chrom_fh, '<', $path)) {
        warn "Skipping $path: $!\n";
        return;
    }

    my $in_alu   = 0;    # true between an ALU header and its "transitions" line
    my $have_top = 0;    # true when a top row is waiting for its bottom row
    my $tail_gap = 0;    # true when the latest row pair ended in a gap column
    my @top_row;         # nucleotides of the pending top row
    my $alu_name;        # first space-separated field of the ALU header
    my $stats;           # counters of the current record (hash reference)

    while (my $line = <$chrom_fh>) {
        chomp $line;

        my $is_header = ($line =~ /^ALU/);

        if ($is_header) {
            print {$aludet_fh} format_aludet_line($file_name, $line);

            # Start a new record.  All per-record state is reset here so an
            # unterminated or odd-lined previous record cannot leak into it.
            $in_alu     = 1;
            $have_top   = 0;
            $tail_gap   = 0;
            $stats      = new_alu_stats();
            ($alu_name) = split(/ /, $line);
        }

        next unless $in_alu;

        if ($line =~ m/transitions/) {

            # End of the record.
            $in_alu = 0;

            if ($tail_gap) {

                # The last column of the record was counted as an internal
                # gap while scanning; reclassify it as a terminal gap.
                $stats->{terminalgaps} += 1;
                $stats->{internalgaps} -= 1;
                $tail_gap = 0;
            }

            if ($stats->{length} >= $MIN_ALU_LENGTH) {
                append_alu_stats($alu_name, $stats);
            }
            next;
        }

        # The header line itself is never treated as an alignment row.
        next if $is_header;

        # Only lines containing a nucleotide or a gap are alignment rows.
        next unless $line =~ m/(G|C|A|-|T)/;

        $line =~ s/^\s+//;
        $line =~ s/\s+$//;

        if (!$have_top) {
            @top_row  = split(//, $line);
            $have_top = 1;
            next;
        }

        my @bottom_row = split(//, $line);
        $tail_gap = tally_alignment_pair(\@top_row, \@bottom_row, $stats);
        $have_top = 0;
    }

    close($chrom_fh);
    return;
}

#------------------------------------------------------------------------------
# format_aludet_line($file_name, $line)
#
# Returns the text logged to aluinaludet.txt for one ALU header line: the
# file name, padding, a tab, the header line and a newline.  File names of
# 10 to 17 characters are space-padded to 18 columns so the output lines up;
# names of any other length are written without padding.
#------------------------------------------------------------------------------
sub format_aludet_line {
    my ($file_name, $line) = @_;

    my $name_length = length($file_name);
    my $padding     = '';
    if ($name_length >= 10 && $name_length <= 17) {
        $padding = ' ' x (18 - $name_length);
    }

    return $file_name . $padding . "\t$line\n";
}

#------------------------------------------------------------------------------
# new_alu_stats()
#
# Returns a reference to a hash holding every counter in @STAT_FIELDS, all
# set to zero.
#------------------------------------------------------------------------------
sub new_alu_stats {
    my %stats = map { $_ => 0 } @STAT_FIELDS;
    return \%stats;
}

#------------------------------------------------------------------------------
# tally_alignment_pair(\@top, \@bottom, \%stats)
#
# Compares one (top, bottom) pair of alignment rows column by column over the
# width of the top row and updates the counters in %stats:
#
#   - equal, non-gap columns                 -> matches
#   - differing columns with no gap          -> mismatches, and transitions
#                                               or transversions when both
#                                               symbols are A/C/G/T
#   - a gap in either row                    -> terminalgaps for the very
#                                               first column of a record,
#                                               internalgaps otherwise
#   - gap in top only                        -> insertions (+ gap2a/c/g/t)
#   - gap in bottom only                     -> deletions  (+ a/c/g/t2gap)
#
# Finally the width of the top row is added to $stats->{length}.
#
# Returns 1 when this is not the first pair of the record and the last column
# holds a gap in either row (a candidate trailing terminal gap), 0 otherwise.
# A bottom row shorter than the top row is treated as padded with empty
# symbols.
#------------------------------------------------------------------------------
sub tally_alignment_pair {
    my ($top, $bottom, $stats) = @_;

    my $width         = scalar @{$top};
    my $is_first_pair = ($stats->{length} == 0);

    for my $column (0 .. $width - 1) {
        my $top_base    = $top->[$column];
        my $bottom_base = defined $bottom->[$column] ? $bottom->[$column] : '';

        my $top_is_gap    = ($top_base eq '-');
        my $bottom_is_gap = ($bottom_base eq '-');

        if ($top_is_gap || $bottom_is_gap) {

            # Only the first column of the first pair is a leading terminal
            # gap; a trailing one is reclassified when the record ends.
            if ($is_first_pair && $column == 0) {
                $stats->{terminalgaps} += 1;
            }
            else {
                $stats->{internalgaps} += 1;
            }

            if ($top_is_gap && !$bottom_is_gap) {
                $stats->{insertions} += 1;
                if (exists $BASE_CLASS{$bottom_base}) {
                    $stats->{ 'gap2' . lc($bottom_base) } += 1;
                }
            }
            elsif (!$top_is_gap && $bottom_is_gap) {
                $stats->{deletions} += 1;
                if (exists $BASE_CLASS{$top_base}) {
                    $stats->{ lc($top_base) . '2gap' } += 1;
                }
            }
        }
        elsif ($top_base eq $bottom_base) {
            $stats->{matches} += 1;
        }
        else {
            $stats->{mismatches} += 1;

            # Substitutions involving anything other than A/C/G/T are
            # counted as mismatches only.
            if (exists $BASE_CLASS{$top_base} && exists $BASE_CLASS{$bottom_base}) {
                if ($BASE_CLASS{$top_base} eq $BASE_CLASS{$bottom_base}) {
                    $stats->{transitions} += 1;
                }
                else {
                    $stats->{transversions} += 1;
                }
            }
        }
    }

    # The trailing-gap test does not depend on the column, so it is done once
    # per pair instead of once per column.
    my $ends_in_gap = 0;
    if (!$is_first_pair && $width > 0) {
        my $last        = $width - 1;
        my $last_bottom = defined $bottom->[$last] ? $bottom->[$last] : '';
        if ($top->[$last] eq '-' || $last_bottom eq '-') {
            $ends_in_gap = 1;
        }
    }

    $stats->{length} += $width;

    return $ends_in_gap;
}

#------------------------------------------------------------------------------
# append_alu_stats($alu_name, \%stats)
#
# Appends one comma-separated line with the counters of %stats, in the order
# of @STAT_FIELDS, to "<alu_name>.txt".  The three-argument open keeps a name
# read from the input from being interpreted as an open mode.  Failure to
# open or close the file is reported with a warning.
#------------------------------------------------------------------------------
sub append_alu_stats {
    my ($alu_name, $stats) = @_;

    my $out_path = "$alu_name.txt";

    my $out_fh;
    unless (open($out_fh, '>>', $out_path)) {
        warn "Cannot append to $out_path: $!\n";
        return;
    }

    print {$out_fh} join(',', map { $stats->{$_} } @STAT_FIELDS), "\n";

    close($out_fh) or warn "Error closing $out_path: $!\n";
    return;
}
