Wrum, wrum, wrum. Java is awesome at parsing binary files.

master
Justyna Ilczuk 2013-04-06 15:46:05 +02:00
parent 0252245893
commit c3df7b0c97
8 changed files with 737 additions and 0 deletions

View File

@ -0,0 +1,23 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package mobireader;
import nl.flotsam.preon.annotation.BoundNumber;
/**
*
* @author att
*/
public class EXTHHeader {
// Declares three consecutive numeric fields, each annotated as 32 bits wide,
// in the same order as the three 'I' entries of the struct-style format
// string quoted below.
// NOTE(review): '>' denotes big-endian in Python struct notation; no byte
// order is specified on the annotations here - confirm the decoder's default
// matches.
//headerfmt = '>III'
@BoundNumber(size="32")
long identifier;
@BoundNumber(size="32")
long headerLength;
@BoundNumber(size="32")
long recordCount;
}

View File

@ -0,0 +1,29 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package mobireader;
import nl.flotsam.preon.annotation.BoundNumber;
import nl.flotsam.preon.annotation.BoundString;
/**
*
* @author att
*/
public class Header {
    // Field order and widths follow the struct-style format string
    // "32shhIIIIII4s4sIIH" used by Mobi.parseHeader:
    //   32s -> name, h h -> attributes/version (16 bit each),
    //   6 x I -> created .. sortInfoId (32 bit each),
    //   4s 4s -> type/creator, I I -> uniqueIDseed/nextRecordListId,
    //   H -> number_of_records (16 bit).
    // The declaration order is part of the binary layout; do not reorder.
    // NOTE(review): no byte order is specified on the annotations - confirm
    // the decoder's default matches the file format.

    public @BoundString(size="32") String name;

    // 'h' entries of the format: two bytes each. Leaving the size unset
    // would bind them at the width of the Java type and shift every
    // following field.
    @BoundNumber(size="16") int attributes;
    @BoundNumber(size="16") int version;

    @BoundNumber(size="32") int created;
    @BoundNumber(size="32") int modified;
    @BoundNumber(size="32") int backup;
    @BoundNumber(size="32") int modnum;
    @BoundNumber(size="32") int appInfoId;
    @BoundNumber(size="32") int sortInfoId;

    @BoundString(size="4") String type;
    @BoundString(size="4") String creator;

    @BoundNumber(size="32") int uniqueIDseed;
    @BoundNumber(size="32") int nextRecordListId;

    // 'H' entry of the format: two bytes.
    @BoundNumber(size="16") int number_of_records;
}

View File

@ -0,0 +1,172 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package mobireader;
import com.google.common.io.Files;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOError;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.HashMap;
import nl.flotsam.preon.Codec;
import nl.flotsam.preon.Codecs;
import nl.flotsam.preon.DecodingException;
import nl.flotsam.preon.annotation.BoundNumber;
import nl.flotsam.preon.annotation.BoundString;
/**
*
* @author att
*/
public class Mobi {
    /**
     * Charset in which every byte maps to exactly one char, so that indices
     * into {@link #contents} are identical to byte offsets in the file.
     */
    private static final String BYTE_PRESERVING_CHARSET = "ISO-8859-1";

    File file;
    int offset = 0;
    String contents;
    Header header;

    /**
     * Creates a reader for the given file name. Nothing is read until
     * {@link #parse()} is called.
     *
     * @param filename path of the file to read
     */
    public Mobi(String filename) {
        this.file = new File(filename);
    }

    /**
     * Loads the whole file into {@link #contents} and parses the header.
     *
     * @throws IOException if the file cannot be read
     */
    void parse() throws IOException {
        byte[] raw = Files.toByteArray(this.file);
        // Decoding with the platform default charset would replace or merge
        // bytes of this binary file and break offset-based slicing.
        this.contents = new String(raw, BYTE_PRESERVING_CHARSET);
        this.header = parseHeader();
        //this.records = self.parseRecordInfoList();
        //this..readRecord0()
    }

    /**
     * Computes the number of bytes described by a struct-style format
     * string such as {@code "32shhI"}: each type character contributes its
     * width, optionally multiplied by a decimal repeat count written in
     * front of it. Characters without a known width contribute nothing, and
     * a trailing repeat count without a type character is ignored.
     *
     * @param headerFormat the format string
     * @return the total size in bytes
     */
    public int calcsize(String headerFormat) {
        int size = 0;
        StringBuilder repeatCount = new StringBuilder();
        for (int i = 0; i < headerFormat.length(); i++) {
            char c = headerFormat.charAt(i);
            if (Character.isDigit(c)) {
                repeatCount.append(c);
            } else if (repeatCount.length() > 0) {
                size += addNumberOfBytes(Integer.parseInt(repeatCount.toString()), c);
                repeatCount.setLength(0);
            } else {
                size += addNumberOfBytes(1, c);
            }
        }
        return size;
    }

    /**
     * Returns the size in bytes of {@code n} values of the type denoted by
     * the format character {@code c}; unknown characters have size zero.
     */
    int addNumberOfBytes(int n, char c) {
        int base;
        switch (c) {
            case 'c':
            case 's':
            case 'b':
                base = 1;
                break;
            case 'h':
            case 'H':
                base = 2;
                break;
            case 'i':
            case 'I':
            case 'l':
            case 'L':
            case 'f':
                base = 4;
                break;
            case 'd':
                base = 8;
                break;
            default:
                base = 0;
                break;
        }
        return base * n;
    }

    /**
     * Slices the header bytes out of {@link #contents} and returns a header
     * object. The slice is not decoded yet, so the returned header is
     * currently always empty.
     *
     * @return a new, unpopulated {@link Header}
     * @throws StringIndexOutOfBoundsException if the contents are shorter
     *         than the header
     */
    Header parseHeader() {
        String headerfmt = "32shhIIIIII4s4sIIH";
        int headerlen = calcsize(headerfmt);
        // Not consumed yet; kept so that a file shorter than the header
        // still fails here, exactly as before.
        String headerData = this.contents.substring(this.offset,
                this.offset + headerlen);
        Header parsedHeader = new Header(); //createHeaderBasedOn(headerData);
        /*
        # unpack header, zip up into list of tuples
        results = zip(fields, unpack(headerfmt, self.contents[self.offset:self.offset+headerlen]))
        # increment offset into file
        this.offset += headerlen;
        # convert tuple array to dictionary
        resultsDict = utils.toDict(results);
        */
        return parsedHeader;
    }

    /**
     * Decodes a {@link Header} from the start of the given file. This is a
     * best-effort operation: on any read or decoding failure the problem is
     * reported on standard error and an empty header is returned.
     *
     * @param file the file to decode
     * @return the decoded header, or an empty one if decoding failed
     */
    public Header createHeaderBasedOn(File file) {
        try {
            Codec<Header> codec = Codecs.create(Header.class);
            return Codecs.decode(codec, file);
        } catch (IOException e) {
            return emptyHeaderAfterFailure(file, e);
        } catch (DecodingException e) {
            return emptyHeaderAfterFailure(file, e);
        }
    }

    /** Reports a decoding failure and supplies the empty fallback header. */
    private Header emptyHeaderAfterFailure(File file, Exception failure) {
        // Report the exception itself: its cause is frequently null, which
        // would hide what actually went wrong.
        System.err.println("Could not decode header from " + file + ": " + failure);
        return new Header();
    }
    /*
    def readRecord(self, recordnum, disable_compression=False):
    if self.config:
    if self.config['palmdoc']['Compression'] == 1 or disable_compression:
    return self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+1]['record Data Offset']];
    elif self.config['palmdoc']['Compression'] == 2:
    result = uncompress_lz77(self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+1]['record Data Offset']-self.config['mobi']['extra bytes']])
    return result
    def readImageRecord(self, imgnum):
    if self.config:
    recordnum = self.config['mobi']['First Image index'] + imgnum;
    return self.readRecord(recordnum, disable_compression=True);
    def author(self):
    "Returns the author of the book"
    return self.config['exth']['records'][100]
    def title(self):
    "Returns the title of the book"
    return self.config['mobi']['Full Name']
    */
}

View File

@ -0,0 +1,92 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package mobireader;
import nl.flotsam.preon.annotation.BoundNumber;
import nl.flotsam.preon.annotation.BoundString;
/**
*
* @author att
*/
public class MobiHeader {
// Field list for the MOBI header. The fields are declared in the same order
// as the entries of the struct-style format string quoted below: every 'I'
// corresponds to a field annotated as 32 bits, every 'H' to one annotated as
// 16 bits, and the '40s' / '36s' / '8s' entries to the three @BoundString
// fields of matching size.
// The declaration order mirrors the format string, so reordering fields
// would break the correspondence.
// NOTE(review): '>' denotes big-endian in Python struct notation; no byte
// order is specified on the annotations here - confirm the decoder's default
// matches.
//headerfmt = '> IIII II 40s III IIIII IIII I 36s IIII 8s HHIIIII'
@BoundNumber(size="32")
long identifier;
@BoundNumber(size="32")
long header_length;
@BoundNumber(size="32")
long Mobi_type;
@BoundNumber(size="32")
long text_Encoding;
@BoundNumber(size="32")
long Unique_ID;
@BoundNumber(size="32")
long Generator_version;
// '40s' entry of the format string.
@BoundString(size="40")
String Reserved;
@BoundNumber(size="32")
long FirstNonBookIndex;
@BoundNumber(size="32")
long FullNameOffset;
@BoundNumber(size="32")
long FullNameLength;
@BoundNumber(size="32")
long Language;
@BoundNumber(size="32")
long InputLanguage;
@BoundNumber(size="32")
long OutputLanguage;
@BoundNumber(size="32")
long FormatVersion;
@BoundNumber(size="32")
long FirstImageIndex;
@BoundNumber(size="32")
long FirstHuffRecord;
@BoundNumber(size="32")
long HuffRecordCount;
@BoundNumber(size="32")
long FirstDATPRecord;
@BoundNumber(size="32")
long DATPRecordCount;
@BoundNumber(size="32")
long EXTHFlags;
// '36s' entry of the format string.
@BoundString(size="36")
String unknown36Bytes;
@BoundNumber(size="32")
long DRMOffset;
@BoundNumber(size="32")
long DRMCount;
@BoundNumber(size="32")
long DRMSize;
@BoundNumber(size="32")
long DRMFlags;
// '8s' entry of the format string.
@BoundString(size="8")
String unknown8Bytes;
// The two 'H' entries of the format string.
@BoundNumber(size="16")
int Unknown1;
@BoundNumber(size="16")
int LastImageRecord;
@BoundNumber(size="32")
long Unknown2;
@BoundNumber(size="32")
long FCISRecord;
@BoundNumber(size="32")
long Unknown3;
@BoundNumber(size="32")
long FLISRecord;
@BoundNumber(size="32")
long Unknown4;
}

View File

@ -34,5 +34,6 @@ public class MobiReader extends Application {
public static void main(String[] args) {
launch(args);
}
}

View File

@ -0,0 +1,30 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package mobireader;
import nl.flotsam.preon.annotation.BoundNumber;
/**
*
* @author att
*/
public class PalmDocHeader {
// Field list for the PalmDoc header. The fields are declared in the same
// order as the entries of the struct-style format string quoted below: each
// 'H' corresponds to a field annotated as 16 bits and the single 'I' to
// textLength, annotated as 32 bits.
// NOTE(review): '>' denotes big-endian in Python struct notation; no byte
// order is specified on the annotations here - confirm the decoder's default
// matches.
//headerfmt = '>HHIHHHH'
@BoundNumber(size="16")
int Compression;
@BoundNumber(size="16")
int Unused;
@BoundNumber(size="32")
long textLength;
@BoundNumber(size="16")
int recordCount;
@BoundNumber(size="16")
int recordSize;
@BoundNumber(size="16")
int encryptionType;
@BoundNumber(size="16")
int unknown;
}

View File

@ -0,0 +1,289 @@
/*
* The MIT License
*
* Copyright (c) 2009 Olle Törnström studiomediatech.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
* CREDIT: Initially implemented by Diogo Kollross and made publicly available
* on the website http://www.geocities.com/diogok_br/lz77. Edited here
* to provide two flavours for JavaScript usage, either as standalone
* compressor/decompressor or as class for copy/paste use.
*/
/**
* This class provides simple LZ77 compression and decompression.
*
* USAGE: Place in your own project package of choice adding preferred package
* setting.
*
* @author Olle Törnström olle[at]studiomediatech[dot]com
* @created 2009-02-18
*/
package unzipping;
public class LZ77 {

    /**
     * Number of characters a back-reference occupies in compressed text:
     * the prefix, two distance digits and one length digit.
     */
    private static final int REFERENCE_TOKEN_LENGTH = 4;

    private final char referencePrefix;
    private final int referenceIntBase;
    private final int referenceIntFloorCode;
    private final int referenceIntCeilCode;
    private final int maxStringDistance;
    private final int minStringLength;
    private final int maxStringLength;
    private final int defaultWindowLength;
    private final int maxWindowLength;

    // CONSTRUCTOR

    /**
     * Creates a codec with the fixed text-based format parameters: a
     * backtick as reference prefix and base-96 digits starting at the space
     * character.
     */
    public LZ77() {
        referencePrefix = '`';
        referenceIntBase = 96;
        referenceIntFloorCode = (int) ' ';
        referenceIntCeilCode = referenceIntFloorCode + referenceIntBase;
        maxStringDistance = (int) Math.pow(referenceIntBase, 2) - 1;
        minStringLength = 5;
        maxStringLength = (int) Math.pow(referenceIntBase, 1) - 1
                + minStringLength;
        defaultWindowLength = 144;
        maxWindowLength = maxStringDistance + minStringLength;
    }

    // LAZY STATIC METHODS - ADDED BY: DAN!

    /**
     * Compresses {@code data} with a fresh codec and the default window.
     *
     * @param data string data to compress
     * @return LZ77 compressed string
     */
    public static String compressStr(String data) {
        LZ77 lz = new LZ77();
        return lz.compress(data, null);
    }

    /**
     * Decompresses {@code data} with a fresh codec.
     *
     * @param data string produced by one of the compress methods
     * @return the original string
     */
    public static String decompressStr(String data) {
        LZ77 lz = new LZ77();
        return lz.decompress(data);
    }

    // PUBLIC METHODS

    /**
     * Compress string data using the LZ77 algorithm.
     *
     * @param data
     *            String data to compress
     * @return LZ77 compressed string
     */
    public String compress(String data) {
        return compress(data, null);
    }

    /**
     * Compress string data using the LZ77 algorithm.
     *
     * <p>Repeated substrings of at least five characters are replaced by a
     * four-character back-reference; literal occurrences of the reference
     * prefix are doubled so the decompressor can tell them apart.
     *
     * <p>Note: with a window large enough to produce distances of 6144 or
     * more, the first distance digit can itself be the prefix character,
     * which {@link #decompress(String)} would read as an escaped prefix.
     * The default window is far below that limit.
     *
     * @param data
     *            String data to compress
     * @param windowLength
     *            Optional window length; {@code null} selects the default
     * @return LZ77 compressed string
     * @throws IllegalArgumentException
     *             if the window length exceeds the supported maximum
     */
    public String compress(String data, Integer windowLength) {
        int window = (windowLength == null) ? defaultWindowLength : windowLength;
        if (window > maxWindowLength) {
            throw new IllegalArgumentException("Window length too large");
        }

        StringBuilder compressed = new StringBuilder();
        int pos = 0;
        int lastPos = data.length() - minStringLength;

        while (pos < lastPos) {
            int searchStart = Math.max(pos - window, 0);
            int matchLength = minStringLength;
            boolean foundMatch = false;
            int bestMatchDistance = maxStringDistance;
            int bestMatchLength = 0;

            while ((searchStart + matchLength) < pos) {
                // regionMatches is false when the target region would run
                // past the end of the data, which is the same outcome as
                // comparing a clipped substring against an unclipped one.
                boolean isValidMatch =
                        data.regionMatches(searchStart, data, pos, matchLength)
                        && matchLength < maxStringLength;
                if (isValidMatch) {
                    matchLength++;
                    foundMatch = true;
                } else {
                    int realMatchLength = matchLength - 1;
                    if (foundMatch && (realMatchLength > bestMatchLength)) {
                        bestMatchDistance = pos - searchStart - realMatchLength;
                        bestMatchLength = realMatchLength;
                    }
                    matchLength = minStringLength;
                    searchStart++;
                    foundMatch = false;
                }
            }

            if (bestMatchLength != 0) {
                compressed.append(referencePrefix)
                        .append(encodeReferenceInt(bestMatchDistance, 2))
                        .append(encodeReferenceLength(bestMatchLength));
                pos += bestMatchLength;
            } else {
                appendLiteral(compressed, data.charAt(pos));
                pos++;
            }
        }

        // The tail is too short to be searched for matches, but prefix
        // characters in it must still be escaped or decompression would
        // misread them as references.
        for (int i = pos; i < data.length(); i++) {
            appendLiteral(compressed, data.charAt(i));
        }
        return compressed.toString();
    }

    /**
     * Reverses {@link #compress(String, Integer)}.
     *
     * @param data
     *            LZ77 compressed string
     * @return the decompressed string
     * @throws RuntimeException
     *             if a reference contains a character outside the digit
     *             range
     * @throws StringIndexOutOfBoundsException
     *             if the data ends in the middle of a reference or a
     *             reference points outside the text decoded so far
     */
    public String decompress(String data) {
        StringBuilder decompressed = new StringBuilder();
        int pos = 0;
        while (pos < data.length()) {
            char currentChar = data.charAt(pos);
            if (currentChar != referencePrefix) {
                decompressed.append(currentChar);
                pos++;
                continue;
            }
            char nextChar = data.charAt(pos + 1);
            if (nextChar == referencePrefix) {
                // Doubled prefix: one literal prefix character.
                decompressed.append(referencePrefix);
                pos += 2;
                continue;
            }
            int distance = decodeReferenceInt(data.substring(pos + 1, pos + 3), 2);
            int length = decodeReferenceLength(data.substring(pos + 3, pos + 4));
            int start = decompressed.length() - distance - length;
            int end = start + length;
            decompressed.append(decompressed.substring(start, end));
            pos += REFERENCE_TOKEN_LENGTH;
        }
        return decompressed.toString();
    }

    // PRIVATE METHODS

    /** Appends one literal character, doubling it if it is the prefix. */
    private void appendLiteral(StringBuilder out, char c) {
        if (c == referencePrefix) {
            out.append(referencePrefix);
        }
        out.append(c);
    }

    /**
     * Encodes {@code value} as exactly {@code width} base-96 digits, most
     * significant first, left-padded with the zero digit.
     */
    private String encodeReferenceInt(int value, int width) {
        if ((value >= 0) && (value < (Math.pow(referenceIntBase, width) - 1))) {
            StringBuilder digits = new StringBuilder();
            while (value > 0) {
                digits.insert(0, (char) ((value % referenceIntBase) + referenceIntFloorCode));
                value = value / referenceIntBase;
            }
            while (digits.length() < width) {
                digits.insert(0, (char) referenceIntFloorCode);
            }
            return digits.toString();
        } else {
            throw new IllegalArgumentException("Reference int out of range: "
                    + value + " (width = " + width + ")");
        }
    }

    /** Encodes a match length as one digit, offset by the minimum length. */
    private String encodeReferenceLength(int length) {
        return encodeReferenceInt(length - minStringLength, 1);
    }

    /** Decodes the first {@code width} characters of {@code data} as base-96 digits. */
    private int decodeReferenceInt(String data, int width) {
        int value = 0;
        for (int i = 0; i < width; i++) {
            value *= referenceIntBase;
            int charCode = (int) data.charAt(i);
            if ((charCode >= referenceIntFloorCode)
                    && (charCode <= referenceIntCeilCode)) {
                value += charCode - referenceIntFloorCode;
            } else {
                throw new RuntimeException(
                        "Invalid char code in reference int: " + charCode);
            }
        }
        return value;
    }

    /** Decodes a one-digit match length, adding back the minimum length. */
    private int decodeReferenceLength(String data) {
        return decodeReferenceInt(data, 1) + minStringLength;
    }
}

View File

@ -0,0 +1,101 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package unzipping;
/**
*
* @author att
*/
public class lz77Unzipper {
// Empty placeholder: no members are defined yet. The block comment that
// follows this class holds a Python routine (uncompress_lz77), presumably
// kept as the reference for a port that has not been written - TODO confirm.
}
/*
import struct
# ported directly from the PalmDoc Perl library
# http://kobesearch.cpan.org/htdocs/EBook-Tools/EBook/Tools/PalmDoc.pm.html
def uncompress_lz77(data):
length = len(data);
offset = 0; # Current offset into data
# char; # Character being examined
# ord; # Ordinal of $char
# lz77; # 16-bit Lempel-Ziv 77 length-offset pair
# lz77offset; # LZ77 offset
# lz77length; # LZ77 length
# lz77pos; # Position inside $lz77length
text = ''; # Output (uncompressed) text
# textlength; # Length of uncompressed text during LZ77 pass
# textpos; # Position inside $text during LZ77 pass
while offset < length:
# char = substr($data,$offset++,1);
char = data[offset];
offset += 1;
ord_ = ord(char);
# print " ".join([repr(char), hex(ord_)])
# The long if-elsif chain is the best logic for $ord handling
## no critic (Cascading if-elsif chain)
if (ord_ == 0):
# Nulls are literal
text += char;
elif (ord_ <= 8):
# Next $ord bytes are literal
text += data[offset:offset+ord_] # text .=substr($data,$offset,ord);
offset += ord_;
elif (ord_ <= 0x7f):
# Values from 0x09 through 0x7f are literal
text += char;
elif (ord_ <= 0xbf):
# Data is LZ77-compressed
# From Wikipedia:
# "A length-distance pair is always encoded by a two-byte
# sequence. Of the 16 bits that make up these two bytes,
# 11 bits go to encoding the distance, 3 go to encoding
# the length, and the remaining two are used to make sure
# the decoder can identify the first byte as the beginning
# of such a two-byte sequence."
offset += 1;
if (offset > len(data)):
print("WARNING: offset to LZ77 bits is outside of the data: %d" % offset);
return text;
lz77, = struct.unpack('>H', data[offset-2:offset])
# Leftmost two bits are ID bits and need to be dropped
lz77 &= 0x3fff;
# Length is rightmost 3 bits + 3
lz77length = (lz77 & 0x0007) + 3;
# Remaining 11 bits are offset
lz77offset = lz77 >> 3;
if (lz77offset < 1):
print("WARNING: LZ77 decompression offset is invalid!");
return text;
# Getting text from the offset is a little tricky, because
# in theory you can be referring to characters you haven't
# actually decompressed yet. You therefore have to check
# the reference one character at a time.
textlength = len(text);
for lz77pos in range(lz77length): # for($lz77pos = 0; $lz77pos < $lz77length; $lz77pos++)
textpos = textlength - lz77offset;
if (textpos < 0):
print("WARNING: LZ77 decompression reference is before"+
" beginning of text! %x" % lz77);
return;
text += text[textpos:textpos+1]; #text .= substr($text,$textpos,1);
textlength+=1;
else:
# 0xc0 - 0xff are single characters (XOR 0x80) preceded by
# a space
text += ' ' + chr(ord_ ^ 0x80);
return text;
*/