From c3df7b0c9778533c6fcbf98e28f215d0f3910ede Mon Sep 17 00:00:00 2001 From: Justyna Ilczuk Date: Sat, 6 Apr 2013 15:46:05 +0200 Subject: [PATCH] Wrum, wrum, wrum. Java is awesome at parsing binary files. --- MobiReader/src/mobireader/EXTHHeader.java | 23 ++ MobiReader/src/mobireader/Header.java | 29 ++ MobiReader/src/mobireader/Mobi.java | 172 +++++++++++ MobiReader/src/mobireader/MobiHeader.java | 92 ++++++ MobiReader/src/mobireader/MobiReader.java | 1 + MobiReader/src/mobireader/PalmDocHeader.java | 30 ++ MobiReader/src/unzipping/LZ77.java | 289 +++++++++++++++++++ MobiReader/src/unzipping/lz77Unzipper.java | 101 +++++++ 8 files changed, 737 insertions(+) create mode 100644 MobiReader/src/mobireader/EXTHHeader.java create mode 100644 MobiReader/src/mobireader/Header.java create mode 100644 MobiReader/src/mobireader/Mobi.java create mode 100644 MobiReader/src/mobireader/MobiHeader.java create mode 100644 MobiReader/src/mobireader/PalmDocHeader.java create mode 100644 MobiReader/src/unzipping/LZ77.java create mode 100644 MobiReader/src/unzipping/lz77Unzipper.java diff --git a/MobiReader/src/mobireader/EXTHHeader.java b/MobiReader/src/mobireader/EXTHHeader.java new file mode 100644 index 0000000..8a6055b --- /dev/null +++ b/MobiReader/src/mobireader/EXTHHeader.java @@ -0,0 +1,23 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
+ */ +package mobireader; + +import nl.flotsam.preon.annotation.BoundNumber; + +/** + * + * @author att + */ +public class EXTHHeader { + //headerfmt = '>III' + @BoundNumber(size="32") + long identifier; + @BoundNumber(size="32") + long headerLength; + @BoundNumber(size="32") + long recordCount; + + +} diff --git a/MobiReader/src/mobireader/Header.java b/MobiReader/src/mobireader/Header.java new file mode 100644 index 0000000..1d0623f --- /dev/null +++ b/MobiReader/src/mobireader/Header.java @@ -0,0 +1,29 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +package mobireader; + +import nl.flotsam.preon.annotation.BoundNumber; +import nl.flotsam.preon.annotation.BoundString; + +/** + * + * @author att + */ +public class Header { + public @BoundString(size="32") String name; + @BoundNumber int attributes; + @BoundNumber int version; + @BoundNumber int created; + @BoundNumber int modified; + @BoundNumber int backup; + @BoundNumber int modnum; + @BoundNumber int appInfoId; + @BoundNumber int sortInfoId; + @BoundString(size="4") String type; + @BoundString(size="4") String creator; + @BoundNumber int uniqueIDseed; + @BoundNumber int nextRecordListId; + @BoundNumber int number_of_records; +} \ No newline at end of file diff --git a/MobiReader/src/mobireader/Mobi.java b/MobiReader/src/mobireader/Mobi.java new file mode 100644 index 0000000..0a349db --- /dev/null +++ b/MobiReader/src/mobireader/Mobi.java @@ -0,0 +1,172 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
+ */ +package mobireader; + +import com.google.common.io.Files; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOError; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.util.HashMap; +import nl.flotsam.preon.Codec; +import nl.flotsam.preon.Codecs; +import nl.flotsam.preon.DecodingException; +import nl.flotsam.preon.annotation.BoundNumber; +import nl.flotsam.preon.annotation.BoundString; +/** + * + * @author att + */ + + + +public class Mobi { + File file; + int offset = 0; + String contents; + Header header; + public Mobi(String filename) + { + try { + this.file = new File(filename); + } + catch (IOError e) { + throw e; + } + + } + + void parse() throws IOException + { + byte compressed [] = Files.toByteArray(this.file); + this.contents = new String(compressed); + this.header = parseHeader(); + //this.records = self.parseRecordInfoList(); + //this..readRecord0() + } + + public int calcsize(String headerFormat) + { + int size = 0; + boolean is_number = false; + String number = ""; + String type = ""; + for(int i = 0; i < headerFormat.length(); i++) + { + char c = headerFormat.charAt(i); + if(Character.isDigit(c)) + { + number += c; + is_number = true; + } + else if (is_number){ + size += addNumberOfBytes(Integer.parseInt(number), c); + is_number = false; + number = ""; + } + else { + size += addNumberOfBytes(1, c); + } + } + return size; + } + + int addNumberOfBytes(int n, char c) + { + int base; + + switch (c) { + case 'c': base = 1; + break; + case 's': base = 1; + break; + case 'b': base = 1; + break; + case 'h': base = 2; + break; + case 'H': base = 2; + break; + case 'i': base = 4; + break; + case 'I': base = 4; + break; + case 'l': base = 4; + break; + case 'L': base = 4; + break; + case 'f': base = 4; + break; + case 'd': base = 8; + break; + default: base = 0; + break; + } + return base * n; + } + + Header parseHeader(){ + String headerfmt = "32shhIIIIII4s4sIIH"; + int 
headerlen = calcsize(headerfmt); + String headerData = this.contents.substring(this.offset, + this.offset+headerlen); + + Header parsedHeader = new Header(); //createHeaderBasedOn(headerData); + + /* + # unpack header, zip up into list of tuples + results = zip(fields, unpack(headerfmt, self.contents[self.offset:self.offset+headerlen])) + + # increment offset into file + this.offset += headerlen; + + # convert tuple array to dictionary + resultsDict = utils.toDict(results); + */ + return parsedHeader; + } + + public Header createHeaderBasedOn(File file) + { + Header headerFromText; + try { + Codec
codec = Codecs.create(Header.class); + headerFromText = Codecs.decode(codec, file); + } + catch( IOException e ){ + System.out.println(e.getCause()); + headerFromText = new Header(); + } + catch (DecodingException e) + { + System.out.println(e.getCause()); + headerFromText = new Header(); + } + return headerFromText; + } + /* + def readRecord(self, recordnum, disable_compression=False): + if self.config: + if self.config['palmdoc']['Compression'] == 1 or disable_compression: + return self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+1]['record Data Offset']]; + elif self.config['palmdoc']['Compression'] == 2: + result = uncompress_lz77(self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+1]['record Data Offset']-self.config['mobi']['extra bytes']]) + return result + + def readImageRecord(self, imgnum): + if self.config: + recordnum = self.config['mobi']['First Image index'] + imgnum; + return self.readRecord(recordnum, disable_compression=True); + + def author(self): + "Returns the author of the book" + return self.config['exth']['records'][100] + + def title(self): + "Returns the title of the book" + return self.config['mobi']['Full Name'] + */ +} diff --git a/MobiReader/src/mobireader/MobiHeader.java b/MobiReader/src/mobireader/MobiHeader.java new file mode 100644 index 0000000..f6bb54e --- /dev/null +++ b/MobiReader/src/mobireader/MobiHeader.java @@ -0,0 +1,92 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
+ */ +package mobireader; + +import nl.flotsam.preon.annotation.BoundNumber; +import nl.flotsam.preon.annotation.BoundString; + +/** + * + * @author att + */ +public class MobiHeader { + //headerfmt = '> IIII II 40s III IIIII IIII I 36s IIII 8s HHIIIII' + @BoundNumber(size="32") + long identifier; + @BoundNumber(size="32") + long header_length; + @BoundNumber(size="32") + long Mobi_type; + @BoundNumber(size="32") + long text_Encoding; + + @BoundNumber(size="32") + long Unique_ID; + @BoundNumber(size="32") + long Generator_version; + + @BoundString(size="40") + String Reserved; + + @BoundNumber(size="32") + long FirstNonBookIndex; + @BoundNumber(size="32") + long FullNameOffset; + @BoundNumber(size="32") + long FullNameLength; + + @BoundNumber(size="32") + long Language; + @BoundNumber(size="32") + long InputLanguage; + @BoundNumber(size="32") + long OutputLanguage; + @BoundNumber(size="32") + long FormatVersion; + @BoundNumber(size="32") + long FirstImageIndex; + + @BoundNumber(size="32") + long FirstHuffRecord; + @BoundNumber(size="32") + long HuffRecordCount; + @BoundNumber(size="32") + long FirstDATPRecord; + @BoundNumber(size="32") + long DATPRecordCount; + + @BoundNumber(size="32") + long EXTHFlags; + + @BoundString(size="36") + String unknown36Bytes; + + @BoundNumber(size="32") + long DRMOffset; + @BoundNumber(size="32") + long DRMCount; + @BoundNumber(size="32") + long DRMSize; + @BoundNumber(size="32") + long DRMFlags; + + @BoundString(size="8") + String unknown8Bytes; + + @BoundNumber(size="16") + int Unknown1; + @BoundNumber(size="16") + int LastImageRecord; + @BoundNumber(size="32") + long Unknown2; + @BoundNumber(size="32") + long FCISRecord; + @BoundNumber(size="32") + long Unknown3; + @BoundNumber(size="32") + long FLISRecord; + @BoundNumber(size="32") + long Unknown4; +} diff --git a/MobiReader/src/mobireader/MobiReader.java b/MobiReader/src/mobireader/MobiReader.java index 9a11700..e0a8489 100644 --- a/MobiReader/src/mobireader/MobiReader.java +++ 
b/MobiReader/src/mobireader/MobiReader.java @@ -34,5 +34,6 @@ public class MobiReader extends Application { public static void main(String[] args) { launch(args); + } } diff --git a/MobiReader/src/mobireader/PalmDocHeader.java b/MobiReader/src/mobireader/PalmDocHeader.java new file mode 100644 index 0000000..f3e1e08 --- /dev/null +++ b/MobiReader/src/mobireader/PalmDocHeader.java @@ -0,0 +1,30 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +package mobireader; + +import nl.flotsam.preon.annotation.BoundNumber; + +/** + * + * @author att + */ +public class PalmDocHeader { + //headerfmt = '>HHIHHHH' + @BoundNumber(size="16") + int Compression; + @BoundNumber(size="16") + int Unused; + @BoundNumber(size="32") + long textLength; + @BoundNumber(size="16") + int recordCount; + @BoundNumber(size="16") + int recordSize; + @BoundNumber(size="16") + int encryptionType; + @BoundNumber(size="16") + int unknown; + +} diff --git a/MobiReader/src/unzipping/LZ77.java b/MobiReader/src/unzipping/LZ77.java new file mode 100644 index 0000000..a85707d --- /dev/null +++ b/MobiReader/src/unzipping/LZ77.java @@ -0,0 +1,289 @@ + /* + * The MIT License + * + * Copyright (c) 2009 Olle Törnström studiomediatech.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
/**
 * Simple text-oriented LZ77 compression and decompression.
 *
 * <p>Compressed output is ordinary text. A back-reference is written as the
 * prefix character {@code `} followed by two base-96 digits (distance) and
 * one base-96 digit (length minus the minimum match length); a literal
 * {@code `} is written as two consecutive prefix characters.
 *
 * <p>NOTE(review): this is the backtick-escaped text format of the credited
 * JavaScript implementation, not the byte-oriented PalmDoc scheme described
 * in the reference comment of {@code lz77Unzipper} - confirm before using it
 * on MOBI records.
 *
 * @author Olle Törnström olle[at]studiomediatech[dot]com
 * @created 2009-02-18
 */
public class LZ77 {

    /** Digits used to encode a back-reference distance. */
    private static final int DISTANCE_DIGITS = 2;
    /** Digits used to encode a back-reference length. */
    private static final int LENGTH_DIGITS = 1;
    /** Characters occupied by one back-reference: prefix + distance + length. */
    private static final int REFERENCE_WIDTH = 1 + DISTANCE_DIGITS + LENGTH_DIGITS;

    private final char referencePrefix;
    private final int referenceIntBase;
    private final int referenceIntFloorCode;
    private final int referenceIntCeilCode;
    private final int maxStringDistance;
    private final int minStringLength;
    private final int maxStringLength;
    private final int defaultWindowLength;
    private final int maxWindowLength;

    // CONSTRUCTOR

    /** Creates a codec with the format's fixed parameters. */
    public LZ77() {
        referencePrefix = '`';
        referenceIntBase = 96;
        referenceIntFloorCode = ' ';
        referenceIntCeilCode = referenceIntFloorCode + referenceIntBase;
        maxStringDistance = referenceIntBase * referenceIntBase - 1;
        minStringLength = 5;
        maxStringLength = referenceIntBase - 1 + minStringLength;
        defaultWindowLength = 144;
        maxWindowLength = maxStringDistance + minStringLength;
    }

    // STATIC CONVENIENCE METHODS

    /**
     * Compresses {@code data} with a fresh codec and the default window.
     *
     * @param data text to compress
     * @return compressed text
     */
    public static String compressStr(String data) {
        return new LZ77().compress(data, null);
    }

    /**
     * Decompresses {@code data} with a fresh codec.
     *
     * @param data text produced by {@link #compress(String)}
     * @return the original text
     */
    public static String decompressStr(String data) {
        return new LZ77().decompress(data);
    }

    // PUBLIC METHODS

    /**
     * Compresses string data using the default window length.
     *
     * @param data string data to compress
     * @return LZ77 compressed string
     */
    public String compress(String data) {
        return compress(data, null);
    }

    /**
     * Compresses string data.
     *
     * @param data         string data to compress
     * @param windowLength how far back to search for repeats, or {@code null}
     *                     for the default
     * @return LZ77 compressed string
     * @throws IllegalArgumentException if {@code windowLength} exceeds the
     *         largest distance the format can encode
     */
    public String compress(String data, Integer windowLength) {
        if (windowLength == null) {
            windowLength = defaultWindowLength;
        }
        if (windowLength > maxWindowLength) {
            throw new IllegalArgumentException("Window length too large");
        }

        StringBuilder compressed = new StringBuilder(data.length());
        int pos = 0;
        int lastPos = data.length() - minStringLength;

        while (pos < lastPos) {
            int searchStart = Math.max(pos - windowLength, 0);
            int matchLength = minStringLength;
            boolean foundMatch = false;
            int bestMatchDistance = maxStringDistance;
            int bestMatchLength = 0;

            while ((searchStart + matchLength) < pos) {
                // regionMatches is false when the target would run past the
                // end of data, so a clipped candidate can never match.
                boolean isValidMatch = matchLength < maxStringLength
                        && data.regionMatches(searchStart, data, pos, matchLength);

                if (isValidMatch) {
                    matchLength++;
                    foundMatch = true;
                } else {
                    int realMatchLength = matchLength - 1;
                    if (foundMatch && realMatchLength > bestMatchLength) {
                        int distance = pos - searchStart - realMatchLength;
                        // A distance whose first digit encodes to the prefix
                        // character would be read back as an escaped literal,
                        // so such candidates are skipped.
                        if (!encodesWithLeadingPrefix(distance)) {
                            bestMatchDistance = distance;
                            bestMatchLength = realMatchLength;
                        }
                    }
                    matchLength = minStringLength;
                    searchStart++;
                    foundMatch = false;
                }
            }

            if (bestMatchLength != 0) {
                compressed.append(referencePrefix)
                        .append(encodeReferenceInt(bestMatchDistance, DISTANCE_DIGITS))
                        .append(encodeReferenceLength(bestMatchLength));
                pos += bestMatchLength;
            } else {
                appendLiteral(compressed, data.charAt(pos));
                pos++;
            }
        }

        // The tail is too short to hold a match but must be escaped exactly
        // like every other literal. (The original used a JavaScript regex
        // literal as the pattern, so tail backticks were never escaped.)
        for (int i = pos; i < data.length(); i++) {
            appendLiteral(compressed, data.charAt(i));
        }
        return compressed.toString();
    }

    /**
     * Restores text produced by {@link #compress(String)}.
     *
     * @param data compressed text
     * @return the decompressed text
     * @throws IllegalArgumentException if {@code data} ends inside an escape
     *         or reference, or a reference points before the start of the
     *         output
     * @throws RuntimeException if a reference contains a character outside
     *         the digit range
     */
    public String decompress(String data) {
        StringBuilder decompressed = new StringBuilder();
        int pos = 0;

        while (pos < data.length()) {
            char currentChar = data.charAt(pos);

            if (currentChar != referencePrefix) {
                decompressed.append(currentChar);
                pos++;
                continue;
            }
            if (pos + 1 >= data.length()) {
                throw new IllegalArgumentException(
                        "Truncated input: dangling reference prefix at index " + pos);
            }
            if (data.charAt(pos + 1) == referencePrefix) {
                decompressed.append(referencePrefix);
                pos += 2;
                continue;
            }
            if (pos + REFERENCE_WIDTH > data.length()) {
                throw new IllegalArgumentException(
                        "Truncated input: incomplete reference at index " + pos);
            }

            int distance = decodeReferenceInt(
                    data.substring(pos + 1, pos + 1 + DISTANCE_DIGITS), DISTANCE_DIGITS);
            int length = decodeReferenceLength(
                    data.substring(pos + 1 + DISTANCE_DIGITS, pos + REFERENCE_WIDTH));

            int start = decompressed.length() - distance - length;
            if (start < 0) {
                throw new IllegalArgumentException(
                        "Reference at index " + pos + " points before the start of the output");
            }
            decompressed.append(decompressed.substring(start, start + length));
            pos += REFERENCE_WIDTH;
        }

        return decompressed.toString();
    }

    // PRIVATE METHODS

    /** Appends one literal character, doubling it if it is the prefix. */
    private void appendLiteral(StringBuilder out, char c) {
        out.append(c);
        if (c == referencePrefix) {
            out.append(referencePrefix);
        }
    }

    /** Tells whether the first encoded digit of {@code distance} is the prefix character. */
    private boolean encodesWithLeadingPrefix(int distance) {
        return (distance / referenceIntBase) + referenceIntFloorCode == referencePrefix;
    }

    /**
     * Encodes {@code value} as exactly {@code width} base-96 digits, most
     * significant first, each digit offset by the floor character code.
     */
    private String encodeReferenceInt(int value, int width) {
        long limit = 1;
        for (int i = 0; i < width; i++) {
            limit *= referenceIntBase;
        }
        // Upper bound is exclusive: base^width - 1 is the largest encodable
        // value and was wrongly rejected before.
        if (value < 0 || value >= limit) {
            throw new IllegalArgumentException("Reference int out of range: "
                    + value + " (width = " + width + ")");
        }

        char[] digits = new char[width];
        int remaining = value;
        for (int i = width - 1; i >= 0; i--) {
            digits[i] = (char) ((remaining % referenceIntBase) + referenceIntFloorCode);
            remaining /= referenceIntBase;
        }
        return new String(digits);
    }

    private String encodeReferenceLength(int length) {
        return encodeReferenceInt(length - minStringLength, LENGTH_DIGITS);
    }

    /** Decodes the first {@code width} characters of {@code data} as base-96 digits. */
    private int decodeReferenceInt(String data, int width) {
        int value = 0;

        for (int i = 0; i < width; i++) {
            int charCode = data.charAt(i);
            // The ceiling code itself is one past the last valid digit.
            if (charCode < referenceIntFloorCode || charCode >= referenceIntCeilCode) {
                throw new RuntimeException(
                        "Invalid char code in reference int: " + charCode);
            }
            value = value * referenceIntBase + (charCode - referenceIntFloorCode);
        }

        return value;
    }

    private int decodeReferenceLength(String data) {
        return decodeReferenceInt(data, LENGTH_DIGITS) + minStringLength;
    }
}
0): + # Nulls are literal + text += char; + elif (ord_ <= 8): + # Next $ord bytes are literal + text += data[offset:offset+ord_] # text .=substr($data,$offset,ord); + offset += ord_; + elif (ord_ <= 0x7f): + # Values from 0x09 through 0x7f are literal + text += char; + elif (ord_ <= 0xbf): + # Data is LZ77-compressed + + # From Wikipedia: + # "A length-distance pair is always encoded by a two-byte + # sequence. Of the 16 bits that make up these two bytes, + # 11 bits go to encoding the distance, 3 go to encoding + # the length, and the remaining two are used to make sure + # the decoder can identify the first byte as the beginning + # of such a two-byte sequence." + + offset += 1; + if (offset > len(data)): + print("WARNING: offset to LZ77 bits is outside of the data: %d" % offset); + return text; + + lz77, = struct.unpack('>H', data[offset-2:offset]) + + # Leftmost two bits are ID bits and need to be dropped + lz77 &= 0x3fff; + + # Length is rightmost 3 bits + 3 + lz77length = (lz77 & 0x0007) + 3; + + # Remaining 11 bits are offset + lz77offset = lz77 >> 3; + if (lz77offset < 1): + print("WARNING: LZ77 decompression offset is invalid!"); + return text; + + # Getting text from the offset is a little tricky, because + # in theory you can be referring to characters you haven't + # actually decompressed yet. You therefore have to check + # the reference one character at a time. + textlength = len(text); + for lz77pos in range(lz77length): # for($lz77pos = 0; $lz77pos < $lz77length; $lz77pos++) + textpos = textlength - lz77offset; + if (textpos < 0): + print("WARNING: LZ77 decompression reference is before"+ + " beginning of text! %x" % lz77); + return; + + text += text[textpos:textpos+1]; #text .= substr($text,$textpos,1); + textlength+=1; + else: + # 0xc0 - 0xff are single characters (XOR 0x80) preceded by + # a space + text += ' ' + chr(ord_ ^ 0x80); + return text; +*/ \ No newline at end of file