The ultimate in digital reductionism, with a remarkable data compression ratio.
Because the symbols used as keys for decompression are tracked and encoded along with the data, the output is also thoroughly "scrambled".
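As a small worked example (a hypothetical input, not one of the program's test files): compressing the six bytes of 'abcdbc' with beginSubValue=256, the second occurrence of the pair ('b','c') triggers a rule. The S register collapses to [97, 256, 100, 256] and the A register records the key (98, 99) -> 256; decompression replays that key to expand each 256 back into 'bc'.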
#!/usr/bin/env python3
#Sequitur Compression program
#Theory: read the file a byte at a time. Check whether the newest pair of bytes duplicates any other pair.
#When a pair of pairs is found, create a rule and place it in the A register. A rule reduces a pair of chars to a single char in the S register
#When scanning for pairs, check whether the pair exists in the S register, OR in the A register where previous pairs have been moved for encoding
#If the pair is found in the A register, a single 2-for-1 reduction occurs in S (rather than a double reduction when both pairs are in S)
#The file is read as binary, and substitution characters must be distinct from the original data, so beginSubValue should be chosen above the maximum byte value in the file.
#The encoded file is UTF-8 encoded binary, beginning with ASCII numbers for the final lengths of the S and A registers and the begin value of the substitutions, separated by the delimiting chars 'S', 'A' and 'B', followed by the data contained in S, and finally the tuples from A
#The variable names 'S' and 'A' are used because those appeared in the examples found while researching this project
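#Example of the output layout (hypothetical input b'abcdbc' with beginSubValue=256): S finishes as
#[97, 256, 100, 256] and A holds the single rule (98, 99) -> 256, so the file begins with the header
#'4S1A256B', followed by the 4 chars of S data and then the 2 chars of the one A tuple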
print('\n%\nBegin Sequitur data compression Program')
for g in range(0,30):
    print('-+',end="")
print('\n\\/This program produces an altered file based on the Sequitur algorithm')
print('Ideally it compresses the file to occupy less space')
print('You may wish to edit beginSubValue in the code. This changes the compression efficiency, but the value must be greater than the largest data byte in your file, read as values from 0-255')
print('Test results using beginSubValue=128 for plain text occasionally produced errors')
print('This is not a time-efficient algorithm, so compressing files < 100 kB is recommended')
#infile='Aeneid.txt' #activate this line instead to hard-code the filename
infile=input('type name of file for compression/encryption: ')
fr=open(infile,'rb') #create handle on file for reading
a=fr.read() #copy file contents to a variable
fr.close() #close file
infileSize=len(a)
#a=a[0:50000] #for testing only: grab just a sample segment of the file
infileReduced=len(a) #if segment is used, later a message will be displayed
beginSubValue=256 #more efficient with lower number. Should be > largest data byte
#the decode program might work with wider encodings than UTF-8, but this design was tested with UTF-8; Python's chr() accepts code points up to 0x10FFFF (1114111)
subSym=[q for q in range(beginSubValue,65535)] #May need to increase the upper limit for large files; the ceiling is 0x10FFFF with UTF-8
SSymIdx=0 #points at next symbol to be used, in subSym static list of available symbols
#Static list in this case is contiguous, but the list could be any unique numbers so long as they're distinct from the data bytes
S=list(a[0:3]) #grab 3 bytes to start
A=[[],[]] #substitution array = substitution rules. A[0]: list of tuples. A[1]: list of values
SRuleIdx=0 #for keeping track of how many rules have been created, and for scanning the rule list
SDupe=0 #count of times existing rules are reused
print('Compressing ',len(a),' characters\n\n','Working.',end="")
j=0
r=0
while r<len(a)-3: #offset for the next byte grabbed from the a register
    S.append(a[r+3]) #grab next char for evaluation
    #print('\n>->--- Grab character number --->->-',r,end=" ")
    #print('\nS Before',len(S),S,'\nA Before',len(A[1]),A)
    #////Check Sub Rules already created
    #the newest pair is always the last two symbols of S; len(S)=r+4-2*SRuleIdx-SDupe, so those are the indices used below
    j=0
    #print('Scanning subRuleReg....from S: ',S[r+2-2*SRuleIdx-SDupe],' , ',S[r+3-2*SRuleIdx-SDupe])
    while j < len(A[1]):
        #print('j',j,'LSB ',A[0][j][0],' MSB ',A[0][j][1])
        if S[r+2-2*SRuleIdx-SDupe]==A[0][j][0] and S[r+3-2*SRuleIdx-SDupe]==A[0][j][1]:
            #print('A[0][j][0]=',A[0][j][0],' A[0][j][1]=',A[0][j][1])
            #print('Dupes:',S[r+2-2*SRuleIdx-j-SDupe],S[r+3-2*SRuleIdx-j-SDupe])
            S.pop()
            S.pop()
            S.append(A[1][j])
            SDupe=SDupe+1
            #print('match in subst reg!!!','... SRuleIdx=',SRuleIdx,' SDupe=',SDupe,'\nA:',len(A[1]),A)
            #print('S',len(S),S)
        j=j+1
    #print('Done with Sub Rule scan S',len(S),S)
    #////Check the rest of the S register
    i=0
    while i < len(S)-3:
        #print('i loop...',i,end="")
        #print('.....Test',0+i,1+i,r+2-2*SRuleIdx-SDupe,r+3-2*SRuleIdx-SDupe)
        if S[0+i]==S[r+2-2*SRuleIdx-SDupe] and S[1+i]==S[r+3-2*SRuleIdx-SDupe]:
            #print('>+>+>its a match at','r',r,' i',i,end="")
            #print('matches - Create New Rule: ',S[0+i],S[1+i],S[r+2-2*SRuleIdx-SDupe],S[r+3-2*SRuleIdx-SDupe],' at rule location SRuleIdx=',SRuleIdx)
            A[0].append([S[i],S[i+1]]) #get the bytes out of the S register that will be encoded as a rule in the A register
            A[1].append(subSym[SSymIdx]) #put rule in A register
            SSymIdx=SSymIdx+1
            S.pop(i+1) #purge 2 pairs of symbols (4 pops)
            S.pop(i)
            S.pop()
            S.pop()
            S.insert(i,A[1][SRuleIdx]) #place the substituted bytes in the
            S.append(A[1][SRuleIdx]) #locations where they go
            SRuleIdx=SRuleIdx+1 #get ready for next sub rule
            #print(' SRuleIdx',SRuleIdx,' A Value @ A[1][SRuleIdx]=',A[1][SRuleIdx])
            #print('A Modified',len(A[1]),A,'\nS Modified ',len(S),S)
            break #only need to scan until/if a dupe pair is found, since any previous dupes would already have been encoded
        i=i+1
    r=r+1
    if r%100 == 0: #Progress bar
        print('.',end="",flush=True)
        if r%1000 == 0:
            print(len(a)-r,' ','length subst reg ',len(A[1]))
    while len(S)<4 and r<len(a)-3: #in case fewer than 4 symbols remain in the S register, grab the next byte (guard avoids reading past the end)
        r=r+1
        S.append(a[r+2]) #grab the next unconsumed byte in sequence
        print('appended extra')
print('\n\n+++scanning complete+++\n')
outfile=infile+'.sqc' #name for compressed file
#convert data to ascii in preparation for writing to file
filedataS=''
print('\nCompressed data as ascii integers\n:\n ')
for e in range(0,len(S)):
    filedataS=filedataS+chr((S[e]))
    print(S[e],end=" ")
print('\n\nS register ready for encoding\n:\n',filedataS,'\n\n')
filedataA=''
for f in range(0,len(A[1])): #need to keep track of A[1] length and start value, i.e. subSym
    filedataA=filedataA+chr((A[0][f][0])) #A[0] data tuples, element [0]
    filedataA=filedataA+chr((A[0][f][1])) #A[0] data tuples, element [1]
print('A subst register ready for encoding\n:\n',filedataA,'\n\n')
#prepare compressed ascii data to write to file, starting with list sizes and keys for delineating data groups, 'S', 'A' and 'B'
outfileData=str(len(S))+'S'+str(len(A[1]))+'A'+str(beginSubValue)+'B'+filedataS+filedataA
outfileData=outfileData.encode('UTF-8') #encode and write data to binary file
fw=open(outfile,'wb')
fw.write(outfileData)
fw.close()
#compression summary and statistics
print('A',len(A[1]),A)
print('S',len(S),S,'\n')
print('New rules in A register',len(A[1]),' Rules, number times recycled =',SDupe,'\n\n')
#print('S',len(S),S,'\n')
print('Theoretical Compression Gain/Symbol Count Reduction =',infileReduced-2*len(A[1])-len(S)-3,' Comp gain ratio ',infileReduced/(2*len(A[1])+len(S)+3))
print("wrote to file. UTF-8 encoded data\nhere's a sample:\n",outfileData[0:60],'.....\n\n')
print(len(S),'final length of S register')
print(len(A[1]),'final length of rule register')
if infileSize>infileReduced:
    print('only a sample of the data was captured')
print(infileReduced,'Bytes in original input')
print(len(outfileData),'Bytes written to compressed file\nCompression Gain Ratio: OutputFileSize/InputFileSize\n',len(outfileData)/infileReduced,'\n')
#The following code goes in the Decode program - originally run here for testing how the encoding works
#decodeExpt=outfileData.decode("UTF-8")
#print("Experimental decode")
#for i in range(len(decodeExpt)):
# print(ord(decodeExpt[i]),end=" ")
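
For reference, a minimal decoder sketch follows, built only from the file layout described in the comments at the top. This is an illustrative assumption, not the companion sequiturRead decode program, and it assumes the rule values are the contiguous run beginSubValue, beginSubValue+1, ... exactly as subSym assigns them.

#!/usr/bin/env python3
#Decoder sketch (an assumption, not the sequiturRead program): rebuilds the original
#bytes from the .sqc layout: lengths and beginSubValue delimited by 'S','A','B',
#then the S register chars, then the A register tuple chars
def decodeSqc(path):
    text=open(path,'rb').read().decode('UTF-8')
    sEnd=text.index('S')                     #digits before 'S': final length of S register
    aEnd=text.index('A',sEnd)                #digits before 'A': number of rules
    bEnd=text.index('B',aEnd)                #digits before 'B': beginSubValue
    lenS=int(text[:sEnd])
    lenA=int(text[sEnd+1:aEnd])
    beginSubValue=int(text[aEnd+1:bEnd])
    body=text[bEnd+1:]
    S=[ord(c) for c in body[:lenS]]
    pairs=[(ord(body[lenS+2*f]),ord(body[lenS+2*f+1])) for f in range(lenA)]
    out=[]
    stack=list(reversed(S))                  #expand rule symbols depth-first, left to right
    while stack:
        v=stack.pop()
        if v>=beginSubValue:                 #rule symbol: push its pair back for expansion
            first,second=pairs[v-beginSubValue]
            stack.append(second)
            stack.append(first)
        else:
            out.append(v)                    #original data byte, below beginSubValue
    return bytes(out)

#usage (hypothetical filename):
#open('Aeneid.txt.decoded','wb').write(decodeSqc('Aeneid.txt.sqc'))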