Академический Документы
Профессиональный Документы
Культура Документы
Data Compression
L. J. Institute of Engineering & Technology S.G. Highway, Ahmedabad-382210 CE/IT Department Practical List
VIII - IT
Subject Name: Subject Code: Branch & Sem: SR. NO. 1 Sr. No.
Term DurationFrom: 2-Jan-2012 To: 28-Apr-2012 TEACHING SCHEME (HOURS) THEORY TUTORIAL PRACT. 4 0 2 Practical Week
CREDITS
6 Submission Week
Write a program to count the occurrences of different letters by reading the given text file and also find the probability of each letter 1. with number of bits required for them using the formula: No. of bits = log2(1/probi) Write a program in C to determine whether the set of given codes is 2. uniquely decodable or not.
3. 4. 5. 6. 7. 8.
9-Jan-2012 14-Jan-2012 16-Jan-2012 21-Jan-2012 23-Jan-2012 28-Jan-2012 30-Jan-2012 4-Feb-2012 6-Feb-2012 to 11-Feb-2012 12-Mar-2012 17-Mar-2012 19-Mar-2012 17-Mar-2012 26-Mar-2012 31-Mar-2012 2-Apr-2012 to 7-Apr-2012
16-Jan-2012 21-Jan-2012 23-Jan-2012 28-Jan-2012 30-Jan-2012 4-Feb-2012 6-Feb-2012 11-Feb-2012 13-Feb-2012 18-Feb-2012 19-Mar-2012 17-Mar-2012 26-Mar-2012 31-Mar-2012 2-Apr-2012 7-Apr-2012 9-Apr-201214-Apr-2012 Final Submission
Study of Huffman Compression Algorithm Study of Shannon-Fano compression Algorithm Write a program to implement arithmetic coding Write a program to implement lz77 algorithm. Write a program to implement lz78 algorithm. Write a program to implement lzss algorithm.
9.
Page 1
LJIET
VIII - IT
AIM: Write a program to count the occurrences of different letters by reading the given text file and also find the probability of each letter with number of bits required for them using the formula: No. of bits = log2(1/probi)
Description of Algorithm Open a text file in read mode. Read the file character by character, store it temporarily and compare it with the defined set of characters, calculating the occurrences of all characters. Find the probability of occurrence of each character in the file using the equation: Probability = No. of occurrences of a character/Total No. of characters. Find the number of bits required to encode each letter: Bits = (-1)*(log(Probi)/log(2)) = log2(1/Probi) Find the total number of bits required to encode the file
Example: Contents of text file: aabbccd Probability of each character and number of bits required: a=0.2857 b=0.2857 c=0.2857 d=0.1428
No of Bits Required After Compression for Each Character a=2 b=2 c=2 d=3
Total Number of Bits Required After Compression: (2*2) + (2*2) + (2*2) + (3*1) = 15
Source Code
/*
 * Practical 1: read a text file, count the occurrences of each letter
 * (case-insensitively), print each letter's probability and the number
 * of bits needed to code it, bits_i = ceil(log2(1/p_i)), then the total
 * bit count after compression and the compression ratio versus a flat
 * 7-bit code.
 *
 * Fixes over the original listing: the invalid array initializer
 * a[27]={a,b,c,...}, the undeclared loop character `i` (only `char I`
 * existed), the filename typo "text,txt", the feof()-controlled read
 * loop (which processes EOF as a character), and an unguarded division
 * when the file contains no letters.
 */
#include <stdio.h>
#include <math.h>
#include <ctype.h>

int main(void)
{
    long count[26] = {0};     /* occurrences of 'a'..'z', case-folded */
    long total = 0;           /* total letters seen in the file */
    double total_bits = 0.0;  /* sum of bits_i * count_i */
    int c, j;
    FILE *fp = fopen("text.txt", "r");   /* original had "text,txt" */

    if (fp == NULL) {
        fprintf(stderr, "Cannot open text.txt\n");
        return 1;
    }

    printf("Data of the File : ");
    /* fgetc()+EOF test: the feof() idiom reads one bogus char at EOF */
    while ((c = fgetc(fp)) != EOF) {
        putchar(c);
        if (isalpha((unsigned char)c)) {  /* cast: ctype needs unsigned char */
            count[tolower((unsigned char)c) - 'a']++;
            total++;
        }
    }
    fclose(fp);

    printf("\nNo. of Bits Required Before Compression:%ld\n\n", total * 7);

    if (total == 0) {   /* guard the probability division below */
        printf("File contains no letters.\n");
        return 0;
    }

    printf("Probability Of Each Character :\n\n");
    for (j = 0; j < 26; j++) {
        if (count[j] > 0)
            printf("%c = %f\t", 'a' + j, (double)count[j] / (double)total);
    }

    printf("\n\nBits Required per character : \n\n");
    for (j = 0; j < 26; j++) {
        double p, bits;
        if (count[j] == 0)
            continue;   /* log(0) undefined; absent letters need no code */
        p = (double)count[j] / (double)total;
        bits = ceil(-log(p) / log(2.0));   /* bits = ceil(log2(1/p)) */
        total_bits += bits * (double)count[j];
        printf("%c = %f\t", 'a' + j, bits);
    }

    printf("\n\nNo of Bits Required After Compression: %f", total_bits);
    printf("\n\nCompression Ratio = %f\n", (double)(total * 7) / total_bits);
    return 0;
}
VIII - IT
Advantage: It will be useful to find how much compression will be done by encoding file by value of Compression Ratio. Disadvantage: Because of large number of loops and arrays, memory usage is more. Limitation: If there are characters other than alphabets, then it does not support them.
Page 4
LJIET
Input: Text file with contents: lossless
Data Compression
VIII - IT
Output
e = 3.000000
l = 2.000000
o = 3.000000
s = 1.000000
Page 5
LJIET
VIII - IT
AIM: Write a program in C to determine whether the set of given codes is uniquely decodable or not. Description of algorithm:
Get the set of codes for which unique decodability has to be determined. Suppose we have two code words: o A having length of k bits o B having length of n bits where n>k. If the 1st k bits of symbol B are identical to the k bits of symbol A then the remaining bits are the dangling suffix. Calculate the dangling suffixes for the given set of codes. If a dangling suffix is itself a codeword for a symbol then the code is not a uniquely decodable code. Else the given code is a uniquely decodable code.
Page 6
LJIET
Code for the practical:
Data Compression
VIII - IT
/*
 * Practical 2: decide whether a set of codewords is uniquely decodable,
 * using the Sardinas-Patterson dangling-suffix test.
 *
 * Fixes over the original listing: gets() replaced by a bounded scanf,
 * malloc(strlen(temp)) (no room for the NUL terminator) replaced by
 * fixed-size storage, and strspn() — which computes a character-set
 * span, NOT a common-prefix length — replaced by a real prefix check.
 * The suffix set is also iterated to closure instead of a single pass.
 */
#include <stdio.h>
#include <string.h>

#define MAX_WORDS 100   /* codewords plus generated dangling suffixes */
#define MAX_LEN   32    /* maximum codeword length including NUL */

/* Return 1 when p is a proper prefix of w, else 0. */
static int is_prefix(const char *p, const char *w)
{
    size_t lp = strlen(p), lw = strlen(w);
    return lp < lw && strncmp(p, w, lp) == 0;
}

/* Append s to set[] unless already present; return the new count. */
static int add_unique(char set[][MAX_LEN], int count, const char *s)
{
    int i;
    for (i = 0; i < count; i++)
        if (strcmp(set[i], s) == 0)
            return count;
    strcpy(set[count], s);
    return count + 1;
}

/*
 * Sardinas-Patterson test.  Start from the codewords and repeatedly add
 * every dangling suffix produced by a pair in which one string is a
 * proper prefix of the other (each compared pair must involve an
 * original codeword).  The code is NOT uniquely decodable exactly when
 * a dangling suffix equals a codeword, or two codewords coincide.
 * Prints the closure set, matching the original "The new Array is"
 * output.  Returns 1 = uniquely decodable, 0 = not.
 */
static int uniquely_decodable(char codes[][MAX_LEN], int n)
{
    char set[MAX_WORDS][MAX_LEN];
    int count = n, i, j, k, changed, decodable = 1;

    for (i = 0; i < n; i++)
        strcpy(set[i], codes[i]);

    /* duplicate codewords are ambiguous immediately */
    for (i = 0; i < n; i++)
        for (j = i + 1; j < n; j++)
            if (strcmp(codes[i], codes[j]) == 0)
                decodable = 0;

    do {
        changed = 0;
        for (i = 0; i < count; i++) {
            for (j = 0; j < count; j++) {
                /* skip self-pairs and suffix-vs-suffix pairs */
                if (i == j || (i >= n && j >= n))
                    continue;
                if (!is_prefix(set[i], set[j]))
                    continue;
                {
                    const char *suf = set[j] + strlen(set[i]);
                    for (k = 0; k < n; k++)
                        if (strcmp(suf, codes[k]) == 0)
                            decodable = 0;  /* dangling suffix is a codeword */
                    if (count < MAX_WORDS) {
                        int before = count;
                        count = add_unique(set, count, suf);
                        if (count != before)
                            changed = 1;
                    }
                }
            }
        }
    } while (changed && decodable);

    printf("\nThe new Array is:\n");
    for (i = 0; i < count; i++)
        printf("\n%s", set[i]);

    return decodable;
}

int main(void)
{
    char codes[MAX_WORDS][MAX_LEN];
    int n, i;

    printf("Enter the no of elements:");
    if (scanf("%d", &n) != 1 || n < 1 || n > MAX_WORDS)
        return 1;
    for (i = 0; i < n; i++) {
        printf("\n%d.", i + 1);
        if (scanf("%31s", codes[i]) != 1)   /* bounded read; gets() is unsafe */
            return 1;
    }

    if (uniquely_decodable(codes, n))
        printf("\nThe code is uniquely decodable");
    else
        printf("\nCode is not uniquely decodable");
    return 0;
}

/* Sample run (continued below):
   Input/Output Enter the no of elements:3 1.1 2.10 3.100 */
The new Array is: 1 10 100 0 00 The code is uniquely decodable Output 2: Enter the no of elements:3 1.1 2.10 3.101
Page 9
LJIET
The new Array is: 1 10 101 0 01 1 Code is not uniquely decodable
Data Compression
VIII - IT
ADVANTAGES: We can determine whether the given code is uniquely decodable or not.
Page 10
LJIET
VIII - IT
Bottom up approach:According to the figures, in the first figure we can see that only two nodes arranged contains weight 3 & 4 from right to left and those nodes are the leaves of the binary Huffman tree and tree is drawn up to the root of the tree as shown in figure no 2 & figure no 3.
Fig no 1
Fig no 2
Fig no 3
Steps for building the tree:1. The two free nodes with the lowest weight are located. 2. Parent node for these two nodes are created it is assigned a weight equal to the total weight of the two child nodes. 3. Now the parent node is added to the list of free nodes and also removes the two child nodes from free node list. 4. One of the child node is designated as decoding as 0 bit and another set to 1 5. Steps from 1 to 4 repeat until only one free node is left.
Example:
Page 11
LJIET
Data Compression
VIII - IT
Suppose five symbols are laid down along with there frequencies as shown below 28 A 6 B 11 C 17 D 31 E
Solution Steps:
0 0 13 E (31) A (28) D (17) C (11) F (7) B (6) E (31) A (28) D (17) C (11) F (7) 1 24 1 0 13 B (6) 1
Step 3 0 1 100
Step 4
0 41
1 0 1 24 0 13 1 B (6)
0 59 E (31)
F (7)
A B C D E F Final
Step 5
Nodes are removed from the free list and parent node is added to the free list and repeat the procedure from step 1 to 4 so that we can get below Huffman tree.
To determine the code for a given symbol, we have to walk from the leaf node to root node at the Huffman tree, unfortunately the bits are written in reverse order that we want, that means we have to push bits on to the stack then pop of the generated code.
LJIET
Data Compression
VIII - IT
1. Data structure used in Huffman tree typedef struct tree_node { unsigned int count; unsigned int saved_count; int child_0; int child_1; } NODE; Each node in Huffman tree has several information like count has value of weight associated with it. saved_count has the value when the nodes are taken off from the active list then its count is set to 0, count before setting are saved on saved_count. Child_0 & Child_1 are used to point to the node which is its child node. 1 is child_0 which has value 0 and another is child_1 which has value 1.
Functions:1. count_bytes() :This function counts the every occurrences of the character from start of the file to the end. And also it calculates the position of the pointer is saved when the count starts and restore when it is done. 2. Outout_counts():This means that I store runs of counts, until all the non-zero counts have been stored. At this time the list is terminated by storing a start value of 0. Note that at least 1 run of counts has to be stored, so even if the first start value is 0, I read it in. It also means that even in an empty file that has no counts, I have to pass at least one count. In order to efficiently use this format, I have to identify runs of non-zero counts. Because of the format used, I don't want to stop a run because of just one or two zeros in the count stream. So I have to sit in a loop looking for strings of three or more zero values in a row. This is simple in concept, but it ends up being one of the most complicated routines in the whole program. A routine that just writes out 256 values without attempting to optimize would be much simpler, but would hurt compression quite a bit on small files. 3. Input_counts():When expanding, I have to read in the same set of counts. This is quite a bit easier that the process of writing them out, since no decision making needs to be done. All I do is read in first, check to see if I am all done, and if not, read in last and a string of counts.
4. build_tree():Page 13
LJIET
Data Compression
VIII - IT
Build the Huffman tree after counts have been loaded. This function finds the two minimum weighted node or lowest frequency symbol. At the starting all the 257 nodes have a count value set to that frequency counts and a non zero value this means that the node is active node. After finding two lowest freq. node calculate the parent node and its weight which is equals to total weight of two nodes. Now it set this two nodes as set ot 0 means this two nodes are now inactive so the next time two another lowest freq. nodes will be taken. 5. expand_node():starting at the root node, a single bit at a time is read in by the decoder, if the bit is 0, the next node is one pointed to the child_0 index otherwise pointed to the child_1 index. If the symbol has the specific end of stream symbol, we can exit instead of sending it. 6. void scale_counts( counts, nodes ):unsigned long *counts; NODE *nodes; In order to limit the size of my Huffman codes to 16 bits, I scale my counts down so they fit in an unsigned char, and then store them all as initial weights in my NODE array. The only thing to be careful of is to make sure that a node with a non-zero count doesn't get scaled down to 0. Nodes with values of 0 don't get codes. 7. int build_tree( nodes ):NODE *nodes; Building the Huffman tree is fairly simple. All of the active nodes are scanned in order to locate the two nodes with the minimum weights. These two weights are added together and assigned to a new node. The new node makes the two minimum nodes into its 0 child and 1 child. The two minimum nodes are then marked as inactive. This process repeats until their is only one node left, which is the root node. The tree is done, and the root node is passed back to the calling routine. Node 513 is used here to arbitrarily provide a node with a guaranteed maximum value. It starts off being min_1 and min_2. 
After all active nodes have been scanned, I can tell if there is only one active node left by checking to see if min_1 is still 513. 8. void convert_tree_to_code( nodes, codes, code_so_far, bits, node ):NODE *nodes; CODE *codes; unsigned int code_so_far; int bits; int node; Since the Huffman tree is built as a decoding tree, there is no simple way to get the encoding values for each symbol out of it. This routine recursively walks through the Page 14
LJIET
Data Compression
VIII - IT
tree, adding the child bits to each code until it gets to a leaf. When it gets to a leaf, it stores the code value in the CODE element, and returns. 9. void print_model( nodes, codes ):NODE *nodes; CODE *codes; If the -d command line option is specified, this routine is called to print out some of the model information after the tree is built. Note that this is the only place that the saved_count NODE element is used for anything at all, and in this case it is just fordiagnostic information. By the time I get here, and the tree has been built, every active element will have 0 in its count. 10. void print_char( c ):int c; The print_model routine uses this function to print out node numbers.The catch is, if it is a printable character, it gets printed out as a character. Makes the debug output a little easier to read. 11. void compress_data( input, output, codes ):FILE *input; BIT_FILE *output; CODE *codes;
Once the tree gets built, and the CODE table is built, compressing the data is a breeze. Each byte is read in, and its corresponding Huffman code is sent out. 12. void expand_data( input, output, nodes, root_node ):BIT_FILE *input; FILE *output; NODE *nodes; int root_node; Expanding compressed data is a little harder than the compression phase. As each new symbol is decoded, the tree is traversed, starting at the root node, reading a bit in, and taking either the child_0 or child_1 path. Eventually, the tree winds down to a leaf node, and the corresponding symbol is output. If the symbol is the END_OF_STREAM symbol, it doesn't get written out, and instead the whole process terminates.
Source code :
#include<stdio.h> Page 15
LJIET
#include<iostream.h> #include<conio.h> #include<string.h> #include<math.h> float sum=0; typedef struct node { float prob; node *lptr; node *rptr; int value; int check; }node;
Data Compression
VIII - IT
class Huffman { char arr[20], sortArr[20]; float prob[20], sortProb[20]; int noofbits[20]; int comp[20][10], counter[20], nodeCount; int prev[10], symbol[10], i, k; public: Huffman(); void getData(); void probability(); void sort(); void compress(); void inorder(node *); }; Huffman:: Huffman() { int m, n; for(m=0;m<20;m++) { counter[m]=0; sortArr[m]=0; } i=0; k=0; getData(); } Page 16
LJIET
Data Compression
VIII - IT
void Huffman :: inorder(node *root) { if(root) { prev[i]=0; if(root->value!=36) { symbol[k]=(int)root->value; printf(": %c :",root->value); for(int j=0;j<i;j++) { comp[k][j]=prev[j]; } //cout<<"\n"; for(int m=0;m<i;m++) { cout<<comp[k][m]; } k++; i--; } else i++; inorder(root->rptr); prev[i]=1; if(root->value!=36) { symbol[k]=root->value; cout<<root->value; printf(": %c :",root->value); for(int j=0;j<i;j++) { comp[k][j]=prev[j]; } cout<<"\n"; for(int m=0;m<i;m++) { cout<<comp[k][m]; } k++; i=0; } else i++; Page 17
// //
LJIET
Data Compression
VIII - IT
inorder(root->lptr); } } void Huffman :: getData() { int len; clrscr(); cout<<"Enter any string : "; cin>>arr; len=strlen(arr); probability(); } void Huffman:: probability() { float len; char prev, curr; char done[10]; int flag,count; len=strlen(arr); for(int i=0;i<len;i++) { count=0; prev=arr[i]; flag=0; for(int j=0;j<len;j++) { curr=arr[j]; if(prev==curr) { count++; } } prob[i]=(int)count/len;
Page 18
LJIET
Data Compression
VIII - IT
} sort(); } void Huffman:: sort() { int done[20], flag, count; char temp, curr; float currProb, tempInt; int len, currInt;
len=strlen(arr); count=0; for(int i=0;i<len;i++) { flag=0; curr=arr[i]; currProb=prob[i]; currInt=curr; for(int j=0;j<count;j++) { if(currInt==done[j]) { flag=1; } }
LJIET
}
Data Compression
VIII - IT
for(int m=1;m<count;m++) { for(int n=m;n<count;n++) { if(sortProb[m]<sortProb[n]) { tempInt=sortProb[n]; sortProb[n]=sortProb[m]; sortProb[m]=tempInt; temp=sortArr[n]; sortArr[n]=sortArr[m]; sortArr[m]=temp; } } } for(i=0;i<count;i++) { cout<<"\nChar : "<<sortArr[i]<<" Prob : "<<sortProb[i]; sum=sum+(sortProb[i]*log(sortProb[i])); } sum=(-1)*sum; nodeCount=count; compress(); } void Huffman :: compress() { int i,j,k,l,m,n,flag=0; float tempProb, temp; int tempCount, count, tempVal, tempInt; node *start=new node; node *tempNode=new node; tempNode=NULL; tempCount=nodeCount; for(i=0;i<tempCount-1;i++) { node *node1=new node; node1->prob=sortProb[tempCount-i-1]; node1->value=sortArr[tempCount-i-1]; node1->lptr=NULL; Page 20
LJIET
Data Compression
node1->rptr=NULL;
VIII - IT
if(node1->value==start->value && node1->prob==start->prob) { node1=start; flag=1; } else if(node1->value==tempNode->value && node1->prob==tempNode>prob) { node1=tempNode; flag=1; } node *node2=new node; node2->prob=sortProb[tempCount-i-2]; node2->value=sortArr[tempCount-i-2]; node2->lptr=NULL; node2->rptr=NULL; if(node2->value==start->value && node1->prob == start->prob) { node2=start; flag=1; } else if(node2->value==tempNode->value && node1->prob==tempNode>prob) { node2=tempNode; flag=1; } if(flag==0) { tempNode=start; } else flag=0; node *node3=new node; if(nodeCount==2) { node3->prob=sortProb[nodeCount-1]+sortProb[nodeCount-2]; node3->lptr=tempNode; Page 21
LJIET
Data Compression
node3->rptr=start; } else { node3->prob=node1->prob+node2->prob; node3->lptr=node1; node3->rptr=node2; } node3->value=(int)'$'; start=node3; nodeCount--; tempProb=node3->prob; tempVal=node3->value; sortProb[nodeCount-1]=tempProb; sortArr[nodeCount-1]=tempVal; count=nodeCount; for(int m=0;m<count;m++) { for(int n=m;n<count;n++) { if(sortProb[m]<sortProb[n]) { temp=sortProb[m]; sortProb[m]=sortProb[n]; sortProb[n]=temp; tempInt=sortArr[m]; sortArr[m]=sortArr[n]; sortArr[n]=tempInt; } } }
VIII - IT
LJIET
}
Data Compression
VIII - IT
Output : Enter any string : aaabbcdeee Char : a Char : e Char : b Char : d Char : c Char : a Char : e Char : b Char : $ Prob : 0.3 Prob : 0.3 Prob : 0.2 Prob : 0.1 Prob : 0.1 Prob : 0.3 Prob : 0.3 Prob : 0.2 Prob : 0.2
Char : $ Prob : 0.4 Char : e Prob : 0.3 Char : a Prob : 0.3 Char : $ Prob : 0.6 Char : $ Prob : 0.4 Char : $ Prob : 1.0
Page 23
LJIET
Data Compression
VIII - IT
Advantages: Huffman coding could perform effective data compression by reducing the amount of redundancy in the coding of symbols. It generates an optimal code. Symbols that occur more frequently have shorter code words.
Disadvantages: Huffman codes have to be an integral number of bits long and this can sometimes be a problem. If the probability of a character is 1/3. For example the optimum number of bits to code that character is around 1.8 bits. Non optimal coding becomes a noticeable problem when the probability of a character is very high. If a statistical method could assign 90 percent probability to a given character , the optimal code size could be 0.15 bits , but the Huffman coding system would probably assign a 1 bit code to the symbol, which is six times larger than necessary.
Applications: Lossless image compression: - A simple application of Huffman coding to image compression would be to generate a Huffman code for the set of values that the pixels may take. Text compression: - In text, we have a discrete alphabet that, in a given class, has relatively stationary probabilities. Audio compression: - Another class of data that is very suitable for compression is CD-quality audio data.
Page 24
LJIET
VIII - IT
Shannon Fano algorithm:- This is based on a probability model. The description of this algorithm is given below:- Step1:- For the given list of symbols develop a corresponding list of probabilities or frequencies so that each symbol's relative frequency of occurrence is known. Step2:- Sort the list of symbols according to frequency of occurrence with the most occurring symbol at the top and the least occurring at the bottom. Step3:- Divide the list into two parts with the total frequency count of the upper half being as close as possible to the total of the bottom half. Step4:- The upper half of the list is assigned 0 and the lower half 1; this means that codes in the first half start with 0 and codes in the second half start with 1. Step5:- Recursively apply steps 3 & 4 to each of the two halves, subdividing groups and adding bits to the codes until each symbol has a corresponding code leaf in the tree. For the probability distribution already provided, Figure 1 illustrates the steps involved in coding the characters. Page 25
LJIET
Data Compression
VIII - IT
Table 1 analyses the efficiency of the Shannon-Fano algorithm. As mentioned earlier, the concept of entropy is used to measure the efficiency of any given algorithm. As against a total information content of 233 bits, 237 bits are being used for carrying the message. This translates into an overhead of about 1.7% {(237-233)/233}. The basic reason for this overhead is the approximation involved in rounding off the information content of a character to the nearest integer. For example, although E has an information content of 1.69, 2 bits are used to encode it. This alone accounts for a 9.61-bit overhead (62 - 52.39). Although rounding off may offer some positive overhead also (see row entries of C), the end result is using up more bits than is required to code the message.
E (31) A (28) D (17) C (11) Sort in descending F (7) order B (6) Step 1 E (31) A (28) D (17) C (11) F (7) B (6) Step 5 00 01 10 11* E (31) A (28) D (17) C (11) F (7) B (6) Step 2 E (31) A (28) D (17) C (11) F (7) B (6) Step 6 E (31) A (28) D (17) C (11) F (7) B (6) Step 3 E (31) A (28) D (17) C (11) F (7) B (6) Step 7 0* E (31) A (28) D (17) C (11) F (7) B (6) Step 4 00 01 1*
1*
00 01 10 110 111*
Figure Error! No text of specified style in document.-1: Steps of Shannon-Fano theorem Table Error! No text of specified style in document.-1: Analysis of Shannon-Fano algorithm Total Total Number of Probability Information Bits used Characte Entropy number occurrences of content to code the r (- log2P) of bits in a message occurrence (occurrence message used * entropy) A 28 0.28 1.83 51.24 2 (01) 56 B 6 0.06 4.05 24.3 4 (1111) 24 C 11 0.11 3.18 34.98 3 (110) 33 D 17 0.17 2.55 43.35 2 (10) 34 E 31 0.31 1.69 52.39 2 (00) 62 F 7 0.07 3.83 26.81 4 (1110) 28 233.07 237
Page 26
Data Compression
VIII - IT
class ShannonFano { char arr[20], sortArr[20]; float prob[20], sortProb[20]; int comp[20][10], counter[20]; public: ShannonFano(); void getData(); void probability(); void sort(); void compress(); int split(int, int, int); void print(); }; ShannonFano :: ShannonFano() { int m, n; for(m=0;m<20;m++) { counter[m]=0; sortArr[m]=0; } getData(); } void ShannonFano :: getData() { int len,i=0; char ch; FILE *fp; clrscr();
Page 27
LJIET
Data Compression
if((fp=fopen("c:\\dc.txt","r"))==NULL) { printf("CAN NOTOPEN THE FILE "); getch(); exit(0); } while(!feof(fp)) { ch=fgetc(fp); arr[i]=ch; i++; }
VIII - IT
len=strlen(arr); probability(); } void ShannonFano :: probability() { float len; char prev, curr; char done[10]; int flag,count; len=strlen(arr); for(int i=0;i<len;i++) { count=0; prev=arr[i]; flag=0; for(int j=0;j<len;j++) { curr=arr[j]; if(prev==curr) { count++; } } prob[i]=(int)count/len;
Page 28
LJIET
sort(); } void ShannonFano :: sort() { int done[20], flag, count; char temp, curr; float currProb, tempInt; int len, currInt;
Data Compression
VIII - IT
len=strlen(arr); count=0; for(int i=0;i<len;i++) { flag=0; curr=arr[i]; currProb=prob[i] currInt=curr; for(int j=0;j<count;j++) { if(currInt==done[j]) { flag=1; } } if(flag==1) { } else { sortArr[count]=curr; sortProb[count]=currProb; done[count]=currInt; count++; } } for(int m=0;m<count;m++) { for(int n=m;n<count;n++) Page 29
LJIET
{
Data Compression
VIII - IT
if(sortProb[m]<sortProb[n]) { tempInt=sortProb[m]; sortProb[m]=sortProb[n]; sortProb[n]=tempInt; temp=sortArr[m]; sortArr[m]=sortArr[n]; sortArr[n]=temp; } } } for(i=0;i<count;i++) { cout<<"\nChar : "<<sortArr[i]<<" Prob : "<<sortProb[i]; } compress(); }
void ShannonFano :: compress() { float prev, curr; int mid, len, done, flag1, flag2; float sum1, sum2; int cut,cut1; flag1=1; flag2=1; len=strlen(sortArr); prev=1; sum1=0; done=0; sum2=0; for(int i=0;i<len;i++) { sum1+=sortProb[i]; sum2=1-sum1; if(sum1>sum2) Page 30
LJIET
Data Compression
curr=sum1-sum2; else curr=sum2-sum1; if(curr<prev) { prev=curr; mid=i; } } split(0,mid,0); split(mid+1,len-1,1); print();
VIII - IT
int ShannonFano :: split(int lower, int upper, int flag) { float sum1, sum2, curr, prev, total; int low, up, mid; if(upper-lower==0) { comp[upper][counter[upper]]=flag; counter[upper]++; } else if(upper-lower==1) { comp[upper][counter[upper]]=flag; counter[upper]++; comp[upper][counter[upper]]=1; counter[upper]++; comp[lower][counter[lower]]=flag; counter[lower]++; comp[lower][counter[lower]]=0; counter[lower]++; } else { Page 31
LJIET
Data Compression
for(int i=lower;i<=upper;i++) { comp[i][counter[i]]=flag; counter[i]++; } sum1=0; sum2=0; low=lower; up=upper; prev=1; total=0; for(int j=lower;j<=upper;j++) { total+=sortProb[j]; } for(j=lower;j<=upper;j++) { sum1+=sortProb[j]; sum2=total-sum1; if(sum1>sum2) curr=sum1-sum2; else curr=sum2-sum1; if(curr<prev) { prev=curr; mid=j; } } split(low,mid,0); split(mid+1,up,1); }
VIII - IT
LJIET
Data Compression
VIII - IT
cout<<"\n"; for(i=0;i<len;i++) { cout<<"\n"<<(char)sortArr[i]<<" : "; for(j=0;j<counter[i];j++) { cout<<comp[i][j]; } } cout<<"\n\nThe compressed string is : "; sLen=strlen(arr); for(x=0;x<sLen;x++) { for(y=0;y<len;y++) { if(sortArr[y]==(int)arr[x]) { for(z=0;z<counter[y];z++) { cout<<comp[y][z]; } } } } }
Page 33
LJIET
File content : citc_ Output : Char : c Prob : 0.4 Char : i Prob : 0.2 Char : t Prob : 0.2 Char : _ Prob : 0.2 c:0 i : 100 t : 101 _ : 11
Data Compression
VIII - IT
The compressed string is : 0100101011 Advantages Less compression computing complexity as compare to the Huffman coding. Require less bits then ASCII coding. Disadvantages Compression is not higher than other algorithm like Huffman coding, arithmetic coding and dictionary based coding. We have to transfer all the modeling information from encoding side to decoding side. Applications of Shannon Fano algorithm: Design and Implement software video,optical and mechanical configuration for acquisition and retrieval of electronic id card. Design patented Shannon Fano compression techniques to reduce photograph to 500 bytes and signature to 200 bytes.
Page 34
LJIET
VIII - IT
LJIET
} else {
Data Compression
VIII - IT
flag=0; } } if(flag==0) { k++; ch[k]=str[i]; } } cout<<"\n String2..."; for(i=0;i<=k;i++) { cout<<ch[i]; } getch(); } void sortf() { for(i=0;ch[i]!='\0';i++) { a=ch[i]; count=0; for(j=0;str[j]!='\0';j++) { if(a==str[j]) { count++; } } freq[m]=count; sum+=count; m++; } cout<<"\nTotal char...."<<sum; cout<<"\nchar freq "; for(i=0;i<=k;i++) { cout<<"\n"<<ch[i]<<"\t"<<freq[i]; } cout<<"\n****************************************************"; for(i=0;i<=k;i++) Page 36
LJIET
{
Data Compression
VIII - IT
for(j=i+1;j<=k;j++) { if(ch[i]>ch[j]) { temp=ch[i]; ch[i]=ch[j]; ch[j]=temp; temp1=freq[i]; freq[i]=freq[j]; freq[j]=temp1; } } } cout<<"\n sorting the freq......"; cout<<"\n char \t freq"; for(i=0;i<=k;i++) { cout<<"\n"<<ch[i]<<"\t"<<freq[i]; } cout<<"\n*****************************************************"; getch(); } void prob_range() { cout<<"\n Total Character....."<<sum; for(i=0;i<=k;i++) { p[i]=(double)freq[i]/sum; } r1[0]=0; r2[0]=p[0]; for(i=1;i<=k;i++) { r1[i]=r2[i-1]; r2[i]=r1[i]+p[i]; } getch(); cout<<"\n Char Freq Prob. Range"; for(i=0;i<=k;i++) cout<<"\n"<<ch[i]<<"\t"<<freq[i]<<"\t"<<p[i]<<"\t"<< r1[i]<<"<=r<"<<r2[i]; } void low_high() { double l_range(char); Page 37
LJIET
double h_range(char);
Data Compression
VIII - IT
double range,high_range[100],low_range[100],jj; for(i=0;str[i]!='\0';i++) { range=high-low; for(j=0;ch[j]!='\0';j++) { if(ch[j]==str[i]) break; } low_range[i]=low+range*r1[j]; high_range[i]=low+range*r2[j]; high=high_range[i]; low=low_range[i]; } getch(); cout<<"\n CHAR LOW HIGH"; for(i=0;str[i]!='\0';i++) { printf("\n%c\t%.12lf\t\t%.12lf",str[i], low_range[i],high_range[i]); } printf("\n \n The Encoded Number.........%.12lf",low); } double l_range(char c) { double l; for(i=0;ch[i]!='\0';i++) { if(ch[i]==c) { l=r1[i]; break; } } return l; } double h_range(char c) { double h; for(i=0;i<=k;i++) { if(ch[i]==c) Page 38
LJIET
{ h=r1[i]; break; } } return h;
Data Compression
VIII - IT
} void decoding() { double encode_no; double no; encode_no=low; printf("\n The Encode No...%.12lf",encode_no); printf("\n\nChar Encoded No Low High"); getch(); for(i=0;str[i]!='\0';i++) { for(j=0;ch[j]!='\0';j++) { if(str[i]==ch[j]) { printf("\n%c\t%.12lf\t\t%.2lf\t%.2lf", ch[j],encode_no,r1[j],r2[j]); encode_no=(encode_no-r1[j])/(r2[j]-r1[j]); } } } }
Page 39
LJIET
VIII - IT
LJIET
flag=0; if(i==0) {
Data Compression
VIII - IT
for(j=0;j<WindowIndex;j++) if(Window[j]==string[0]) { stack[top]=j; top++; flag=1; offset=j; } } else { int stacktmp[10],toptmp=0; for(j=0;j<top;j++) { if(Window[stack[j]+1]==string[i]) { flag=1; stacktmp[toptmp]=stack[j]+1; toptmp++; offset=stack[j]+1; } } for(j=0;j<toptmp;j++) stack[j]=stacktmp[j]; top=toptmp; } if(flag==0) { printf("%d %d %c\n",offset+2-i,i,Input[FileIndex]); Window[WindowIndex]=Input[FileIndex]; FileIndex++; WindowIndex++; break; } else { Window[WindowIndex]=Input[FileIndex]; WindowIndex++; FileIndex++; } } Page 41
LJIET
} end: getch(); }
Data Compression
VIII - IT
Page 42
LJIET Output
Data Compression
VIII - IT
A PROGRAM TO IMPLEMENT LZ77 ALGORITHM. ENTER THE STRING TO ENCODE (STRING SHOULDN'T EXCEED 82 CHARACTERS) abracadabra 2 2 2 1 4 1 0 0 0 1 1 3 a b r c d a
Page 43
LJIET
VIII - IT
LJIET
{
Data Compression
VIII - IT
l=lower+((upper-lower)*c[j-1].up_limit); upper=lower+((upper-lower)*c[j].up_limit); lower=l; printf("%f\t%f\n",lower,upper); } } } return((lower+upper)/2); } void tag_decoding(float tag,int cnt) { int i=0,j; float lower=0.0,upper=1.0,l; while(cnt!=0) { for(i=0;i<=MAX;i++) { if(tag>(lower+(upper-lower)*c[i].lo_limit)&&tag<(lower+(upperlower)*c[i].up_limit)) { printf("%c\t",c[i].symbol); l=lower+(upper-lower)*c[i-1].up_limit; upper=lower+(upper-lower)*c[i].up_limit; lower=l; printf("%f\t%f\t%f\n",upper,lower,upper-lower); cnt--; continue; } } } }
Page 45
LJIET
VIII - IT
LJIET
Data Compression
VIII - IT
string[i+1]='\0'; flag=0; if(i==0) { for(j=0;j<WindowIndex;j++) if(Window[j]==string[0]) { stack[top]=j; top++; flag=1; offset=j; } } else { int stacktmp[10],toptmp=0; for(j=0;j<top;j++) { if(Window[stack[j]+1]==string[i]) { flag=1; stacktmp[toptmp]=stack[j]+1; toptmp++; offset=stack[j]+1; } } for(j=0;j<toptmp;j++) stack[j]=stacktmp[j]; top=toptmp; } if(flag==0) { if(i==0) { printf(" 0 %23c\n",Input[FileIndex]); Window[WindowIndex++]= Input[FileIndex++]; break; } else { printf(" 1 %d %d %c\n", offset+2-i,i,Input[FileIndex]); Window[WindowIndex]=Input[FileIndex]; FileIndex++; WindowIndex++; Page 47
LJIET
Data Compression
break; } } else { Window[WindowIndex]=Input[FileIndex]; WindowIndex++; FileIndex++; } } } end: getch();
VIII - IT
Page 48
LJIET Output
Data Compression
VIII - IT
A PROGRAM TO IMPLEMENT LZSS ALGORITHM. ENTER THE STRING TO ENCODE (STRING SHOULDN'T EXCEED 82 CHARACTERS) abracadabra
BIT 0 0 0 1 1 1
OFFSET
1 4 1
COUNT NEXT_CHAR a b r 1 c 1 d 3 a
Page 49