본문 바로가기

UNIX_LINUX_C_C++

[펌] 허프만 코드를 이용한 압축

출처 : http://blog.naver.com/ksw7998/100011413622

사용자로부터 주어진 텍스트 파일을 압축하고 반대로 압축된 파일은 압축을 푸는 프로그램으로호프만 코드 알고리즘을 이용하여 호프만 테이블을 작성한다.

작성된 호프만 테이블로 입력된 텍스트 파일을 하나의 버퍼에 저장 후 파일에 출력 파일에 출력할 때 호프만 코드를 같이 출력한다.

파일 압축을 풀 때 하나의 버퍼에 파일의 내용을 저장 후 현재 프로그램의 버전과의 일치 여부와 코드 파일의 내용을 비교하면서 압축을 푼다.

참고로,2Mbyte파일 압축시75%정도 나온다.

// 사용 방법

압축 할때

실행파일명 -e[E] text.txt output.txt

압축 풀때

실행파일명 -d[D] output.txt text.txt

////////////////////// 소스 코드 ////////////////////////////////

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#define VERSION1.0

#define MAX_SIZE 256

#define CHARBITS 8

#define MAX_CODE_LENGTH16

#define MAX_CODE_NUMBER256

#define MAX_STRING_LENGTH 1024*100 // maximum size of compressed file is 100 kbyte

#define MAX_BIT_MASK_NUMBER 8

#define RIGHTBITS(n, x) ((x) & ((1U << (n)) - 1U))

typedef struct code_tag

{

int ascii;

char code[MAX_CODE_LENGTH];

int lengthOfCode;

unsigned frequency;

} CODE;

int HuffmanCodeTable(CODE codeTable[]);

void SetCode(CODE codeTable[], int cnt);

int EncryptString(char original_string[], int length_Of_Original_string,

CODE codeTable[], int cnt_of_Codes, char encoded_String[]);

void WriteBitString(char encoded_String[], int cnt_Of_EncodedBits);

int DecryptBitString(char encoded_String[], int cnt_Of_EncodeBit,

CODE codeTable[], int cnt_Of_Codes, char decoded_String[]);

int SearchAlphabet(char key, CODE codeTable[], int cnt_Of_Codes);

void AttachCode(char encoded_String[], int cnt_Of_Bits, char code[], int lengthOfCode);

int SearchCode(char key[], CODE codeTable[], int cnt_Of_Codes);

void WriteInFile(char encoded_String[], int cnt_Of_EncodedBits, CODE codeTable[], int cnt_of_codes);

void PrnFreq(CODE codeTable[], int cnt_of_codes);

void Error(char *msg);

void DownHeap(int i);

void GetTableSizeAndCodeBits(int *size_of_table, int *cnt_Of_EncodeBit);

int GetVersion();

void GetCodeTable(CODE codeTable[], int cnt_of_codes);

void GetCode(char encoded_string[]);

void PrnCodeTable(CODE codeTable[], int cnt_of_codes);

void WriteDecodedBit(char decoded_String[]);

char bitMask_OR[MAX_BIT_MASK_NUMBER];

char bitMask_AND[MAX_BIT_MASK_NUMBER];

int heap_size, heap[2*MAX_SIZE-1], parent[2*MAX_SIZE-1], left[2*MAX_SIZE-1], right[2*MAX_SIZE-1];

unsigned long int frequency[2*MAX_SIZE-1];

FILE *ifp, *ofp;

int main(int argc, char *argv[])

{

int i, j = 0;

char Original_String[MAX_STRING_LENGTH];

char encoded_String[MAX_STRING_LENGTH];

char decoded_String[MAX_STRING_LENGTH];

int lengthOfDecodedString;

int cnt_Of_EncodedBits = 0;

CODE codeTable[MAX_CODE_NUMBER];

int cnt_Of_Codes = 0;

char option;

if(argc != 4 || ((option = *argv[1]) != 'e' && option != 'E'

&& option != 'd' && option != 'D'))

{

printf("Usage : %s [e or E|d or D] input_file output_file\n", argv[0]);

exit(1);

}

ifp = fopen(argv[2], "rb");

if(ifp == NULL)

{

printf("Cannot open file : %s", argv[2]);

exit(1);

}

ofp = fopen(argv[3], "wb");

if(ofp == NULL)

{

printf("Cannot open file : %s", argv[3]);

exit(1);

}

for(i=0; i < 8; i++)

{

bitMask_OR[i] = 1 << (7-i);

bitMask_AND[i] = bitMask_OR[i] ^ 0xff;

}

if(option == 'e' || option == 'E')

{

while((i = getc(ifp)) != EOF)

{

Original_String[j++] = i;

if(j >= MAX_STRING_LENGTH)

Error("Error : larger than 100kbyte!\n");

}

Original_String[j]='\0';

rewind(ifp);

cnt_Of_Codes = HuffmanCodeTable(codeTable);

cnt_Of_EncodedBits = EncryptString(Original_String, strlen(Original_String), codeTable, cnt_Of_Codes, encoded_String);

WriteInFile(encoded_String, cnt_Of_EncodedBits, codeTable, cnt_Of_Codes);

}

else

{

if(GetVersion())

Error("Error : The file isn't equivalent the current program

version!\n");

GetTableSizeAndCodeBits(&cnt_Of_Codes, &cnt_Of_EncodedBits);

GetCodeTable(codeTable, cnt_Of_Codes);

GetCode(encoded_String);

lengthOfDecodedString = DecryptBitString(encoded_String, cnt_Of_EncodedBits, codeTable, cnt_Of_Codes, decoded_String);

WriteDecodedBit(decoded_String);

}

fclose(ifp);

fclose(ofp);

return 0;

}

int GetVersion()

{

int i, j = 0;

char buffer[10];

while((i = getc(ifp)) != EOF && j < 3)

{

buffer[j++] = i;

if(j >= MAX_STRING_LENGTH)

Error("Error : larger than 100kbyte!\n");

}

if(atof(buffer) == VERSION)

return 0;

else

return -1;

}

/*************************************************************

* 호프만 테이블 사이즈와 비트의 수 얻기

*

*************************************************************/

void GetTableSizeAndCodeBits(int *size_of_table, int *cnt_Of_EncodeBit)

{

int ch, i = 0;

char buffer[10];

while((ch = getc(ifp)) != EOF && ch != 0x2F)

buffer[i++] = ch;

buffer[i] = '\0';

*size_of_table = atoi(buffer);

i = 0;

while((ch = getc(ifp)) != EOF && ch != 0x20)

buffer[i++] = ch;

*cnt_Of_EncodeBit = atoi(buffer);

return;

}

/*************************************************************

* 호프만 코드 테이블을 메모리에 저장

*

*************************************************************/

void GetCodeTable(CODE codeTable[], int cnt_of_codes)

{

int i = 0, j = 0, tmp, is_block = 0;

while((tmp = getc(ifp)) != EOF || i < cnt_of_codes)

{

if(!is_block && tmp != '0' && tmp != '1')

{

codeTable[i].ascii = tmp;

}

else if(tmp == '0' || tmp == '1')

{

is_block = 1;

codeTable[i].code[j++] = tmp;

continue;

}

else if(is_block && tmp != '0' && tmp != '1')

{

codeTable[i].code[j] = '\0';

is_block = 0, j = 0;

if(cnt_of_codes == ++i)

break;

codeTable[i].ascii = tmp;

}

}

PrnCodeTable(codeTable, cnt_of_codes);

}

/*************************************************************

* 호프만 코드로 작성된 내용을 버퍼에 저장

*

*************************************************************/

void GetCode(char encoded_string[])

{

int ch, i = 0;

while((ch = getc(ifp)) != EOF)

{

encoded_string[i++] = ch;

if(i >= MAX_STRING_LENGTH)

Error("Error : larger than 100kbyte!\n");

}

encoded_string[i] = '\0';

}

void Error(char *msg)

{

printf("%s", msg);

exit(1);

}

void PrnFreq(CODE codeTable[], int cnt_of_codes)

{

int i, k = 0, size = 0;

printf("텍스트 분석 중...\n");

printf("---------------------------------\n");

printf("alphabet\tfrequency\n");

printf("---------------------------------\n");

for(i = 0; i <= cnt_of_codes; i++)

{

if((codeTable[i].ascii >= 65 && codeTable[i].ascii <= 90)

|| (codeTable[i].ascii >= 97 && codeTable[i].ascii <= 122))

{

printf("%c\t\t%d\n", codeTable[i].ascii, codeTable[i].frequency);

size += codeTable[i].frequency;

}

}

printf("---------------------------------\n");

printf("텍스트의 크기 : %d byte\n", size);

printf("---------------------------------\n\n\n");

}

/*************************************************************

* 버퍼에 저장된 호프만 코드를 사용 빈도수에 맞게 버퍼에 저장

*

*************************************************************/

void DownHeap(int i)

{

int j, k;

k = heap[i];

while ((j = 2 * i) <= heap_size)

{

if (j < heap_size && frequency[heap[j]] > frequency[heap[j + 1]])

j++;

if (frequency[k] <= frequency[heap[j]])

break;

heap[i] = heap[j];

i = j;

}

heap[i] = k;

}

/*************************************************************

* 호프만 알고리즘을 이용한 코드 테이블 작성

*

*************************************************************/

int HuffmanCodeTable(CODE codeTable[])

{

int i, j, k, avail, cnt_of_codes;

unsigned long int incount;

for(i = 0; i < MAX_SIZE; i++)

frequency[i] = 0;

while ((i = getc(ifp)) != EOF)

frequency[i]++;

heap[i] = 0;

heap_size = 0;

for(i = 0; i < MAX_SIZE; i++)

{

if(frequency[i] != 0)

heap[++heap_size] = i;

}

for(i = 1; i <= heap_size; i++)

{

codeTable[i-1].ascii = heap[i];

codeTable[i-1].frequency = frequency[heap[i]];

}

cnt_of_codes = heap_size;

PrnFreq(codeTable, cnt_of_codes);

for(i = heap_size / 2; i >= 1; i--)

DownHeap(i);

for(i = 0; i < 2 * MAX_SIZE - 1; i++)

parent[i] = 0;

k = heap[1];

avail = MAX_SIZE;

while (heap_size > 1)

{

i = heap[1];

heap[1] = heap[heap_size--];

DownHeap(1);

j = heap[1];

k = avail++;

frequency[k] = frequency[i] + frequency[j];

heap[1] = k;

DownHeap(1);

parent[i] = k;

parent[j] = -k;

left[k] = i;

right[k] = j;

}

incount = 0;

rewind(ifp);

SetCode(codeTable, cnt_of_codes);

return cnt_of_codes;

}

/*************************************************************

* 아스키 코드에 대한 코드 작성

*

*************************************************************/

void SetCode(CODE codeTable[], int cnt_of_codes)

{

int i, j, k, val = 0;

char temp[MAX_SIZE];

for(i = 0, j = 0, k = 0; i < cnt_of_codes; i++, j = 0, k = 0)

{

val = codeTable[i].ascii;

while ((val = parent[val]) != 0)

{

if (val > 0)

temp[j++] = '0';

else

{

temp[j++] = '1';

val = -val;

}

}

while(--j >= 0)

{

codeTable[i].code[k] = temp[j];

k++;

}

codeTable[i].code[k] = '\0';

codeTable[i].lengthOfCode = strlen(codeTable[i].code);

}

PrnCodeTable(codeTable, cnt_of_codes);

}

void PrnCodeTable(CODE codeTable[], int cnt_of_codes)

{

int i;

printf("Huffman code 생성 중...\n");

printf("---------------------------------\n");

printf("alphabet\tcode\n");

printf("---------------------------------\n");

for(i = 0; i < cnt_of_codes; i++)

if((codeTable[i].ascii >= 65 && codeTable[i].ascii <= 90) || (codeTable[i].ascii >= 97 && codeTable[i].ascii <= 122))

printf("%c\t\t%s\n", codeTable[i].ascii, codeTable[i].code);

printf("---------------------------------\n");

}

/*************************************************************

* 호프만 코드로 문자열 압축

*

*************************************************************/

int EncryptString(char original_String[], int length_Of_Original_String,

CODE codeTable[], int cnt_Of_Codes, char encoded_String[])

{

int i, pos, cnt_Of_Bits = 0;

for(i = 0; i < length_Of_Original_String; i++)

{

pos = SearchAlphabet(original_String[i], codeTable, cnt_Of_Codes);

if(pos >= 0)

{

AttachCode(encoded_String, cnt_Of_Bits, codeTable[pos].code, codeTable[pos].lengthOfCode);

cnt_Of_Bits += codeTable[pos].lengthOfCode;

}

else

{

printf("%c does not exist in code table!!\n", original_String[i]);

return -1;

}

}

return cnt_Of_Bits;

}

void WriteBitString(char encoded_String[], int cnt_Of_EncodedBits)

{

int i, pos = 0;

char a;

for(i=0; i < cnt_Of_EncodedBits; i++)

{

if(i%8==0)

{

putchar(' ');

a = encoded_String[pos++];

}

putchar(((a & bitMask_OR[0]) == 0) ? '0' : '1');

a <<= 1;

}

putchar('\n');

}

void WriteDecodedBit(char decoded_String[])

{

int i, j, length, val, tmp = 0, diff;

double progress = 0.0;

printf("압축 푸는 중 ...\n");

length = (int)strlen(decoded_String);

for(i =0; i < length; i++)

{

progress = (double)(i+1) / (double)length * 100.0;

if((int)progress%10 == 0 && (int)progress != 0 && (int)progress != 100)

{

if((val = (int)progress) != tmp)

{

if((diff = (val - tmp) / 10) > 1)

for(j = 1; j < diff; j++)

printf("%d %% 압축 품\n", tmp + j * 10);

printf("%d %% 압축 품\n", val);

tmp = val;

}

}

else if((int)progress/10 && (int)progress%10 > 5)

{

if((val = ((int)progress/10)*10) != tmp)

{

if((diff = (val - tmp) / 10) > 1)

for(j = 1; j < diff; j++)

printf("%d %% 압축 품\n", tmp + j * 10);

printf("%d %% 압축 품\n", val);

tmp = val;

}

}

else if((val = ((int)progress/10)*10) != tmp && (int)progress != 100)

{

if((diff = (val - tmp) / 10) > 1)

{

for(j = 1; j < diff; j++)

printf("%d %% 압축 품\n", tmp + j * 10);

}

printf("%d %% 압축 품\n", val);

tmp = val;

}

putc(decoded_String[i], ofp);

}

printf("완료.\n");

}

void WriteInFile(char encoded_String[], int cnt_Of_EncodedBits, CODE codeTable[],

int cnt_of_codes)

{

int i, j, ch = 0, incount, outcount, pos = 0, val, tmp = 0, diff;

double version = VERSION, progress = 0.0;

char *p, buffer[MAX_STRING_LENGTH];

printf("압축 중 ...\n");

fseek(ifp, 0L, SEEK_END);

incount = ftell(ifp);

// write a version

sprintf(buffer, "%.1f ", version);

for(i =0; i < (int)strlen(buffer); i++)

putc(buffer[i], ofp);

// write a size of the code tables

sprintf(buffer, "%d", cnt_of_codes);

for(i =0; i < (int)strlen(buffer); i++)

putc(buffer[i], ofp);

sprintf(buffer, "/%d ", cnt_Of_EncodedBits);

for(i =0; i < (int)strlen(buffer); i++)

putc(buffer[i], ofp);

// write contents of the code tables

for(i = 0; i < cnt_of_codes; i++)

{

putc(codeTable[i].ascii, ofp);

for(j = 0; j < codeTable[i].lengthOfCode; j++)

putc(codeTable[i].code[j], ofp);

}

putc(' ', ofp);

// write contents of compressed file

p = encoded_String;

for(i=0; (i < cnt_Of_EncodedBits) && (ch != EOF) && (*p != '\0'); i++)

{

progress = (double)i / (double)cnt_Of_EncodedBits * 100.0;

if((int)progress%10 == 0 && (int)progress != 0 && (int)progress != 100)

{

if((val = (int)progress) != tmp)

{

if((diff = (val - tmp) / 10) > 1)

{

for(j = 1; j < diff; j++)

printf("%d %% 압축\n", tmp + j * 10);

}

printf("%d %% 압축\n", val);

tmp = val;

}

}

else if((int)progress/10 && (int)progress%10 > 5)

{

if((val = ((int)progress/10)*10) != tmp)

{

if((diff = (val - tmp) / 10) > 1)

{

for(j = 1; j < diff; j++)

printf("%d %% 압축\n", tmp + j * 10);

}

printf("%d %% 압축\n", val);

tmp = val;

}

}

else if((val = ((int)progress/10)*10) != tmp && (int)progress != 100)

{

if((diff = (val - tmp) / 10) > 1)

{

for(j = 1; j < diff; j++)

printf("%d %% 압축\n", tmp + j * 10);

}

printf("%d %% 압축\n", val);

tmp = val;

}

if(i%8==0)

ch = putc( p[pos++], ofp );

}

printf("압축 완료.\n");

outcount = ftell(ofp);

printf("---------------------------------\n");

printf("텍스트 파일 크기 : %u bytes\n", incount);

printf("압축된 파일 크기 : %u bytes\n", outcount);

printf("압축 비율 : %.3f(%u/%u - 1)\n", ((double)incount / (double)outcount - 1.0)

, incount, outcount);

}

/*************************************************************

* 호프만 코드로 문자열 압축 풀기

*

*************************************************************/

int DecryptBitString(char encoded_String[], int cnt_Of_EncodeBit,

CODE codeTable[], int cnt_Of_Codes, char decoded_String[])

{

int i, index, pos=0, length_Of_DecodedString=0;

char code[MAX_CODE_LENGTH] = {'\0'};

char a;

for (i = 0; i < cnt_Of_EncodeBit; i++)

{

if(i%8 ==0)

a=encoded_String[pos++];

if(a & bitMask_OR[0])

strcat(code, "1");

else

strcat(code, "0");

a <<= 1;

index=SearchCode(code, codeTable, cnt_Of_Codes);

if(index != -1)

{

decoded_String[length_Of_DecodedString] = codeTable[index].ascii;

length_Of_DecodedString++;

code[0] = '\0';

}

}

decoded_String[length_Of_DecodedString] = '\0';

return length_Of_DecodedString;

}

/*************************************************************

* 호프만 테이블 내의 주어진 알파벳 문자의 인덱스 찾기

*

*************************************************************/

int SearchAlphabet(char key, CODE codeTable[], int cnt_Of_Codes)

{

int i, index = -1;

for (i = 0; i <cnt_Of_Codes; i++)

{

if(codeTable[i].ascii == key)

{

index=i;

break;

}

}

return index;

}

/*************************************************************

* 호프만 테이블 내의 주어진 코드의 인덱스 찾기

*

*************************************************************/

int SearchCode(char key[], CODE codeTable[], int cnt_Of_Codes)

{

int i, index=-1;

for (i = 0; i < cnt_Of_Codes; i++)

{

if(strcmp(codeTable[i].code, key) == 0)

{

index=i;

break;

}

}

return index;

}

/*************************************************************

*하나의 버퍼에 호프만 코드 저장

*

*************************************************************/

void AttachCode(char encoded_String[], int cnt_Of_Bits, char code[],

int lengthOfCode)

{

int i, lengthOfEncodedString, lastBitPosition;

lengthOfEncodedString = cnt_Of_Bits/8;

lastBitPosition = cnt_Of_Bits%8;

for (i = 0; i < lengthOfCode; i++)

{

if(code[i]=='1')

encoded_String[lengthOfEncodedString] |= bitMask_OR[lastBitPosition];

else

encoded_String[lengthOfEncodedString] &= bitMask_AND[lastBitPosition];

lastBitPosition++;

if(lastBitPosition == 8)

{

lastBitPosition = 0;

lengthOfEncodedString = lengthOfEncodedString + 1;

}

}

}

'UNIX_LINUX_C_C++' 카테고리의 다른 글

[펌] LZ 압축 알고리즘  (0) 2011.10.16
[펌] ZIP 알고리즘  (0) 2011.10.16
[펌] 압축 알고리즘 소스 및 정리  (0) 2011.10.16
[펌] SLZ 압축 소스  (0) 2011.10.16
[펌] 공유 메모리(shared memory) 의 사용  (0) 2011.10.14