Commits

Kunjan Kshetri committed 25e8ab5

init repo

  • Participants

Comments (0)

Files changed (4)

File LPZ-compression.pl

+# Lempel Ziev
+use POSIX qw(ceil floor);
+
+# ------------------------------------------------------------------------------
+$string = "kunjan";
+@single_character_array = split('', $string);
+# ------------------------------------------------------------------------------
+#   Compression Algorithm
+#1. Convert it to ASCII---done
+#2. Convert to Binary ---done
+#3  Append-. Make Unique substrings
+#4. Convert to the (position of old guys ,new guy) (back ref ,   suffix)
+#5. Encode  backref(position) as binary
+#6. Combine the entire thing into one big block
+
+# ------------------------------------------------------------------------------
+
+# finds the eight bit of the given character
+sub get_eight_bin{
+ $result = $_[0];
+ $bin = unpack("B*", pack("c", ord($result)));
+ # $bin =~ s/^0+(?=\d)//;
+ # $bin =~ s/\d\d\d\d\d\d\d\d\$//;
+ #return substr $bin,24,32;
+ return $bin;
+}
+
+
+# ------------------------------------------------------------------------------
+
+
+# Represents the Hash to make sure we are getting only unique keys
+%key_blockposition = ();
+# Store the keys in an array
+@key_blocks;
+%suffix_position = ();
+# ------------------------------------------------------------------------------
+
+
+
+# Finds the unique substring for a given string
+sub unique_substring{
+
+    my $string = $_[0];
+    my $length =  length $string;
+    $block_position = 0;
+
+    for (my $i =0 ;$i < $length ;$i++){
+
+	# get the first character
+        my $ch = substr $string,$i,1;
+	#print "now i position is ". $i;
+	#print "\n";
+   
+	if (not (exists $key_blockposition{$ch})){
+	    
+	    #put the block and the block position in the hash
+       	    print "putting in " . $ch ; print "\n";
+	    $key_blockposition{$ch} =  $block_position;
+	    push(@key_blocks,$ch);
+            $block_position =  $block_position + 1;
+	}
+	else {
+
+	     for ($j = ($i +1) ; $j < length $string ;$j++){
+		 
+		# take one more substring
+		$new_substring = substr $string, $i, ($j-$i+1);
+
+
+		# the current string is not in the hash.
+		if (not (exists $key_blockposition{$new_substring})){		  
+
+		    # the substring and the position is put in the hash
+		    $key_blockposition{$new_substring} = $block_position;
+		    
+		    # the position now starts from the end of this new substring
+		    $i = $j;
+		  
+		    $block_position = $block_position + 1;
+
+		    #break out of the loop
+		    last;
+		}
+		print "$new_substring already exists in the hash \n";
+	     }
+	 # push the new substring. if we are at the end then break off.
+	 print "putting in" . $new_substring; print "\n";
+	 push(@key_blocks,$new_substring);
+         if ($j == length $string) { last;}
+	}
+
+    }
+    
+}
+
+# ------------------------------------------------------------------------------
+
+
+# get the binary representation and append that to the string
+foreach my $character(@single_character_array){
+    $append_binary .= get_eight_bin($character);
+}
+
+unique_substring($append_binary);
+# ------------------------------------------------------------------------------
+
+sub find_offset{
+
+    my $key = $_[0];
+    $current_position =$_[1];
+    $prefix_length =  length($key)-1;
+
+    $prefix= substr $key,0, $prefix_length;
+
+    if($prefix eq ''){
+#      	print "NO PREFIX ";
+	$offset = 0;
+    }
+    else{
+	my($prefix_position) = grep { $key_blocks[$_] eq $prefix } 0..$#key_blocks;
+	 $offset = $current_position - $prefix_position;
+	print "THE PREFIX IS $prefix at $prefix_position current 
+               $key is at $current_position offset:$offset\n ";
+    } 
+
+    return $offset;
+
+}
+
+# ------------------------------------------------------------------------------
+
+
+# if we cannot find the encoded_offset, which is when it is the first character
+# we return -1 else we return encoded offset.
+sub encode_offset{
+
+    my $offset = $_[0];
+    my $current_position   = $_[1];
+    
+
+    # encode $offset in $key_blockposition{$key}
+    $position = $current_position + 1;
+    $log_position = log ($position) / log 2;
+    
+    $number_of_bits = ceil($log_position);
+    
+    if ($number_of_bits == 0){
+	$substr_bin = -1;
+    }
+    else{
+    print "NUMBER OF BITS to encode $offset is -> $number_of_bits" . "\n";
+    my $str = unpack("B32", pack("N", $offset));
+    # $bin = unpack("B32", pack("N", ord($offset)));
+    $substr_bin = substr $str,(32 -  $number_of_bits),32;
+    }
+    print "binary represenation is $substr_bin" . "\n";
+    return $substr_bin;
+}
+
+
+# ------------------------------------------------------------------------------
+
+# Iterate over all the keys and find the encoded offset
+sub iterate {
+
+    $compressed_string = "";
+    $length_keys = scalar @key_blocks;
+
+    for(my $i = 0 ; $i < $length_keys ; $i++){
+
+
+	my $key = $key_blocks[$i];
+	print "********************************\n";
+	print "Processing $key at $i position\n ";
+	
+	#find the location of the prefix and this is the offset
+	$offset = find_offset($key,$i);
+
+	$encoded_offset = encode_offset($offset,$i);
+	
+	$start = length ($key)-1 ;
+	$suffix= substr $key, $start ,1;
+
+	print "$encoded_offset , $suffix \n";
+
+	if ($encoded_offset != -1){
+	    $concat_encoded_suffix = $encoded_offset . $suffix;
+	}
+	else{
+	    $concat_encoded_suffix = $suffix;
+	}
+       # print "for $key whose suffix is $suffix encoded offset is $encoded_offset" . "\n";
+       #  print "the concatenated suffix and the offset is $concat_encoded_suffix" ."\n";
+	$compressed_string = $compressed_string . $concat_encoded_suffix;
+       
+    }
+    print  "----------".$compressed_string."-------";
+    return $compressed_string;
+
+}
+
+
+
+# ------------------------------------------------------------------------------
+# TESTS
+
+=pod
+print find_offset("0"); print "\n";
+print find_offset("1");  print "\n";
+print find_offset("10"); print "\n";
+print find_offset("100"); print "\n";
+print find_offset("101"); print "\n";
+print find_offset("1011"); print "\n";
+#print find_offset("00"); print "\n";
+#print find_offset("110"); print "\n";
+#print find_offset("1100"); print "\n";
+#print find_offset("001"); print "\n";
+=cut
+
+=pod
+print encode_offset(find_offset("0"),"0"); print "\n";
+print encode_offset(find_offset("1"),"1");  print "\n";
+print encode_offset(find_offset("11"),"11"); print "\n";
+print encode_offset(find_offset("01"),"01"); print "\n";
+print encode_offset(find_offset("010"),"010"); print "\n";
+print encode_offset(find_offset("111"),"111"); print "\n";
+print encode_offset(find_offset("00"),"00"); print "\n";
+print encode_offset(find_offset("110"),"110"); print "\n";
+print encode_offset(find_offset("1100"),"1100"); print "\n";
+print encode_offset(find_offset("001"),"001"); print "\n";
+=cut
+
+$compressed_string =  iterate();
+print "\nthe compressed string is : $compressed_string \n";
+# ------------------------------------------------------------------------------
+

File LPZ-decompression.pl

+use POSIX qw(ceil floor);
+
+
+# split the string into smaller blocks
+sub split_into_array{
+    
+
+    $compressed_string = $_[0];
+    @blocks;
+    $block_count =1;
+
+    for (my $i = 0 ; $i < length($compressed_string) ; $i++){
+
+	$current_block_length = ceil((log($block_count)/ log(2))) + 1;
+
+	$start =$i;
+	$end   =$current_block_length;
+
+	$current_substring = substr $compressed_string ,$start,$end;
+	
+#	print $current_substring . "\n";
+	push(@blocks,$current_substring);
+
+	$i = $i+ $current_block_length-1;
+	$block_count =$block_count + 1;
+
+    }
+
+# split the string into smaller blocks
+sub split_into_array{
+    
+
+    $compressed_string = $_[0];
+    @blocks;
+    $block_count =1;
+
+    for (my $i = 0 ; $i < length($compressed_string) ; $i++){
+
+	$current_block_length = ceil((log($block_count)/ log(2))) + 1;
+
+	$start =$i;
+	$end   =$current_block_length;
+
+	$current_substring = substr $compressed_string ,$start,$end;
+	
+#	print $current_substring . "\n";
+	push(@blocks,$current_substring);
+
+	$i = $i+ $current_block_length-1;
+	$block_count =$block_count + 1;
+    return @blocks;
+}
+
+sub decompress{
+
+    @prefix_array ;
+    @string_constructed;
+    $final_string ="";
+# split the string into smaller blocks
+sub split_into_array{
+    
+
+    $compressed_string = $_[0];
+    @blocks;
+    $block_count =1;
+
+    for (my $i = 0 ; $i < length($compressed_string) ; $i++){
+
+	$current_block_length = ceil((log($block_count)/ log(2))) + 1;
+
+	$start =$i;
+	$end   =$current_block_length;
+
+	$current_substring = substr $compressed_string ,$start,$end;
+	
+#	print $current_substring . "\n";
+	push(@blocks,$current_substring);
+
+	$i = $i+ $current_block_length-1;
+	$block_count =$block_count + 1;
+
+    
+    for (my $i = 0 ; $i < scalar @blocks ;$i++){
+
+	$current_string  = $blocks[$i];
+	$length = length($current_string);
+	$prefix_length = $length - 1;
+	print "Current substring is $current_string and the length of prefix is $prefix_length \n";
+
+	$prefix = substr $current_string , 0, $prefix_length;
+	$decimal_prefix = bin2dec($prefix);
+	print "The Prefix taken is $prefix \n";
+	print "prefix converted to decimal is $decimal_prefix \n";
+
+	$suffix = substr $current_string, $length-1 ,1;
+	print "the suffix is $suffix \n ";
+
+	# pair = $decimal_prefix $suffix
+
+	push(@prefix_array,$prefix);
+
+	
+	$offset = $decimal_prefix;
+	$current_location = $i;
+	
+	if ($decimal_prefix <= 0){
+	    $concatenated = $suffix;
+	}
+	else{
+	    $new_location = $current_location  - $offset ;
+	    $concatenated = $string_constructed[$new_location]. $suffix;
+
+	}
+	push(@string_constructed, $concatenated);
+	$final_string = $final_string .$concatenated;
+	print "the concatenated string is $concatenated. string so far is $final_string \n --------\n";
+
+    }
+    print "FINAL STRING--> $final_string";
+    return $final_string;
+}
+
+
+# convert from binary to decimal
+sub bin2dec {
+    return unpack("N", pack("B32", substr("0" x 32 . shift, -32)));
+}
+
+#convert character to ascii
+sub convert_to_ascii_char {
+
+    $char = $_[0];
+    $decimal_form = bin2dec($char);
+    $ascii_form = chr($decimal_form);
+    return $ascii_form;
+}
+
+# convert to ascii
+sub convert_to_ascii_string{
+    
+    $longText = $_[0];
+    print "\nSplitting the \n$longText\n";
+
+    my @split_array = ($longText =~ m/.{8}/g );
+
+    $bigger_string = "";
+
+    for (my $i = 0 ; $i < scalar @split_array ;$i++){	
+	$current_binary = $split_array[$i];
+	print "\nCURRENT BINARY " .$current_binary;
+	$ascii_form  =convert_to_ascii_char($current_binary);
+	$bigger_string .= $ascii_form;
+
+    }
+
+    return $bigger_string;
+
+}
+
+
+# ------------------------------------------------------------------------------
+#$compressed_main = "00101111100100111110010100001000111";
+$compressed_main = "001010011001101000100110101100001000101000100110000111101110";
+
+$decompressed = decompress(split_into_array($compressed_main));
+$asciid = convert_to_ascii_string($decompressed);
+print "\nDecompressed Character is -------> $asciid";
+# ------------------------------------------------------------------------------
+# Lempel Ziv Algorithm
+# ------------------------------------------------------------------------------
+
+#   Compression Algorithm
+#1. Convert it to ASCII
+#2. Convert to Binary
+#3  Make Unique substrings
+#4. Convert to the (position of old guys ,new guy) (back ref ,   suffix)
+#5. Encode  backref(position) as binary
+#6. Combine the entire thing into one big block
+
+#   Decompression
+
+# ------------------------------------------------------------------------------
+# Compression Algorithm
+
+# Convert Each letter to ASCII
+
+
+sub get_each_character{
+
+
+}
+
+
+
+
+# Compress a string to a list of output symbols.
+sub compress {
+    my $uncompressed = shift;
+
+    # Build the dictionary.
+    my $dict_size = 256;
+    my %dictionary = map {chr $_ => chr $_} 0..$dict_size-1;
+
+    my $w = "";
+    my @result;
+
+    foreach my $c (split '', $uncompressed) {
+
+        my $wc = $w . $c;
+
+        if (exists $dictionary{$wc}) {
+            $w = $wc;
+        } else {
+            push @result, $dictionary{$w};
+            # Add wc to the dictionary.
+            $dictionary{$wc} = $dict_size;
+            $dict_size++;
+            $w = $c;
+        }
+    }
+
+    # Output the code for w.
+    if ($w) {
+        push @result, $dictionary{$w};
+    }
+
+    print $result;
+
+    foreach $result (@result){
+      	$bin = unpack("B*", pack("N", ord($result)));
+        $bin =~ s/^0+(?=\d)//;
+    	print "$result $bin \n";
+        $fullstring .= $bin;
+
+    }
+
+    print $fullstring;
+
+    my %uniquesubstring = ();
+
+
+    for ($j=0;$j<length($fullstring);$j++){
+        for ($i=1;$i<(length($fullstring) - $j);$i++){
+                my $fragment = substr $fullstring, $j, $i;
+                if (not (exists $uniquesubstring{$fragment})){
+                        print "$fragment\n";
+                        $uniquesubstring{$fragment} = 0;
+                }
+    push @uniquesubstring, $fragment;
+    print "$fragment \n";
+        }
+    }
+
+    foreach $key (%uniquesubstring){
+        print "$key \n";
+    }
+
+}
+
+
+# How to use:
+my @compressed = compress('abc');
+#print "@compressed\n";

File test - Main Copy.pl

+# Lempel Ziev
+
+# ------------------------------------------------------------------------------
+#   Compression Algorithm
+#1. Convert it to ASCII---done
+#2. Convert to Binary ---done
+#3  Append-. Make Unique substrings
+#4. Convert to the (position of old guys ,new guy) (back ref ,   suffix)
+#5. Encode  backref(position) as binary
+#6. Combine the entire thing into one big block
+
+# ------------------------------------------------------------------------------
+
+# finds the eight bit of the given character
+sub get_eight_bin{
+ $result = $_[0];
+ $bin = unpack("B*", pack("c", ord($result)));
+ # $bin =~ s/^0+(?=\d)//;
+ # $bin =~ s/\d\d\d\d\d\d\d\d\$//;
+ #return substr $bin,24,32;
+ return $bin;
+}
+
+
+
+# ------------------------------------------------------------------------------
+
+
+# find the unique substring
+
+# keys
+
+# key -> backref
+# 11  -> 1 
+%key_blockno = ();
+
+# key -> position
+# 11  -> 2
+%suffix_position = ();
+
+sub unique_substring{
+
+    my $string = $_[0];
+    my $length =  length $string;
+    $count = 0;
+
+    for (my $i =0 ;$i < $length ;$i++){
+
+        my $ch = substr $string,$i,1;
+	print "now i position is ". $i;
+	print "\n";
+   
+	if (not (exists $key_backref{$ch})){
+	    print "putting in " . $ch ;
+	    print "\n";
+	    $key_backref{$ch} =  $count;
+            $count = $count + 1;
+	}
+	else {
+	     for (my $j = ($i +1) ; $j < length $string ;$j++){
+		# take the substring $i to $j
+		# check if it this substring is there
+		# if not there put in in the hash -> break -> $i= $j+1
+
+		my $new_substring = substr $string, $i, ($j-$i+1);
+
+		if (not (exists $key_backref{$new_substring})){
+		    print "putting in" . $new_substring;
+		    print "\n";
+		    $key_backref{$new_substring} = $count;
+		    
+		    $i = $j;
+		    print "j position". $j;
+		    print "\n";
+		    $count = $count + 1;
+		    last;
+		}
+		
+	     }
+	}
+
+    }
+    
+}
+
+# ------------------------------------------------------------------------------
+
+$string = "usa";
+# 011101010111001101100001
+
+@single_character_array = split('', $string);
+
+
+foreach my $character(@single_character_array){
+    $append_binary .= get_eight_bin($character);
+
+}
+
+unique_substring($append_binary);
+
+foreach my $key (%key_backref){
+    print $key;
+    print "\n";
+}
+
+
+
+# what are we storing? 11
+# what is the prefix? substr 11,0,length-1
+# find the position of the prefix -> %key_position(suffix)
+# offset = current_position - (position of the prefix)
+# array[] = {offset->%suffix  }
+
+# for (i = 0 to array.length)
+# position = i
+# offset = key
+# suffix = value
+# number_of_bits = log2position 
+# convert to binary offset with the number_of_bits
+# start appending
+
+# final append is the compressed one.