Wrum, wrum, wrum. Java is awesome at parsing binary files.

2013-04-06 15:46:05 +02:00 · 2013-04-06 15:46:05 +02:00 · c3df7b0c97
parent 0252245893
commit c3df7b0c97
8 changed files with 737 additions and 0 deletions
--- a/MobiReader/src/mobireader/EXTHHeader.java
+++ b/MobiReader/src/mobireader/EXTHHeader.java
@ -0,0 +1,23 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package mobireader;
+
+import nl.flotsam.preon.annotation.BoundNumber;
+
+/**
+ * Preon-annotated description of an EXTH header: three consecutive numbers,
+ * each bound with a size of 32.
+ *
+ * The fields are listed in the same order as the struct format quoted below;
+ * presumably Preon decodes them in declaration order, so do not reorder them.
+ *
+ * NOTE(review): the quoted format starts with '>' (big-endian in Python's
+ * struct notation), but no byteOrder is set on the annotations -- confirm
+ * that Preon's default byte order is the intended one.
+ *
+ * @author att
+ */
+public class EXTHHeader {
+    // Python struct format this class mirrors: three 4-byte unsigned integers.
+    //headerfmt = '>III'
+    @BoundNumber(size="32")
+    long identifier;
+    @BoundNumber(size="32")
+    long headerLength;
+    @BoundNumber(size="32")
+    long recordCount;
+  
+
+}
--- a/MobiReader/src/mobireader/Header.java
+++ b/MobiReader/src/mobireader/Header.java
@ -0,0 +1,29 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package mobireader;
+
+import nl.flotsam.preon.annotation.BoundNumber;
+import nl.flotsam.preon.annotation.BoundString;
+
+/**
+ * Preon-annotated description of the database header found at the start of
+ * the file, mirroring the struct format "32shhIIIIII4s4sIIH" used by
+ * {@code Mobi.parseHeader()} (78 bytes in total).
+ *
+ * Every numeric field carries an explicit bit size so that the layout agrees
+ * with that format string: the two 'h' fields and the trailing 'H' field are
+ * 16 bits wide, everything marked 'I' is 32 bits wide. Field order is the
+ * binary layout, so do not reorder the declarations.
+ *
+ * NOTE(review): the 'I' fields are unsigned in the format string but are
+ * stored in signed ints here, and no byteOrder is set on the annotations --
+ * confirm both against real files.
+ *
+ * @author att
+ */
+public class Header {
+    // 32s: fixed-width name field.
+    public @BoundString(size="32") String name;
+    // h h: two 16-bit fields.
+    @BoundNumber(size="16") int attributes;
+    @BoundNumber(size="16") int version;
+    // I x 6: six 32-bit fields.
+    @BoundNumber(size="32") int created;
+    @BoundNumber(size="32") int modified;
+    @BoundNumber(size="32") int backup;
+    @BoundNumber(size="32") int modnum;
+    @BoundNumber(size="32") int appInfoId;
+    @BoundNumber(size="32") int sortInfoId;
+    // 4s 4s: two four-character codes.
+    @BoundString(size="4") String type;
+    @BoundString(size="4") String creator;
+    // I I: two 32-bit fields.
+    @BoundNumber(size="32") int uniqueIDseed;
+    @BoundNumber(size="32") int nextRecordListId;
+    // H: 16-bit record count.
+    @BoundNumber(size="16") int number_of_records;
+}
--- a/MobiReader/src/mobireader/Mobi.java
+++ b/MobiReader/src/mobireader/Mobi.java
@ -0,0 +1,172 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package mobireader;
+
+import com.google.common.io.Files;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOError;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.util.HashMap;
+import nl.flotsam.preon.Codec;
+import nl.flotsam.preon.Codecs;
+import nl.flotsam.preon.DecodingException;
+import nl.flotsam.preon.annotation.BoundNumber;
+import nl.flotsam.preon.annotation.BoundString;
+/**
+ * Work-in-progress reader for MOBI files, being ported from a Python
+ * implementation (parts of which are kept as comments inside this class).
+ *
+ * Typical use: construct with a file name, then call {@link #parse()}.
+ *
+ * @author att
+ */
+public class Mobi {
+    File file;
+    int offset = 0;
+    String contents;
+    Header header;
+
+    /**
+     * Creates a reader for the given path. No I/O happens here; the file is
+     * only read by {@link #parse()}.
+     *
+     * @param filename path of the file to read
+     */
+    public Mobi(String filename)
+    {
+        // new File(...) never touches the disk, so there is nothing to catch.
+        this.file = new File(filename);
+    }
+
+    /**
+     * Loads the whole file into {@link #contents} and parses the header.
+     *
+     * @throws IOException if the file cannot be read
+     */
+    void parse() throws IOException
+    {
+        byte[] raw = Files.toByteArray(this.file);
+        // ISO-8859-1 maps every byte to exactly one char, so char offsets in
+        // 'contents' equal byte offsets in the file and no byte is lost. The
+        // platform default charset gives neither guarantee for binary data.
+        this.contents = new String(raw, "ISO-8859-1");
+        this.header = parseHeader();
+        //this.records = self.parseRecordInfoList();
+        //this..readRecord0()
+    }
+
+    /**
+     * Computes the number of bytes described by a Python-struct-style format
+     * string such as "32shhIIIIII4s4sIIH". A decimal count before a type
+     * character multiplies that character's size; a type character without a
+     * count is taken once. Characters that are not known type codes
+     * (byte-order marks, blanks) contribute nothing, and a trailing count with
+     * no type character is ignored.
+     *
+     * @param headerFormat the format string
+     * @return total size in bytes
+     */
+    public int calcsize(String headerFormat)
+    {
+        int size = 0;
+        StringBuilder repeat = new StringBuilder();
+        for (int i = 0; i < headerFormat.length(); i++)
+        {
+            char c = headerFormat.charAt(i);
+            if (Character.isDigit(c))
+            {
+                repeat.append(c);
+            }
+            else
+            {
+                int count = repeat.length() == 0
+                        ? 1
+                        : Integer.parseInt(repeat.toString());
+                size += addNumberOfBytes(count, c);
+                repeat.setLength(0);
+            }
+        }
+        return size;
+    }
+
+    /**
+     * Returns the size in bytes of {@code n} items of struct type {@code c},
+     * using the standard (non-native) sizes of Python's struct module.
+     *
+     * @param n repeat count
+     * @param c struct type character
+     * @return {@code n} times the size of one item, or 0 for an unknown code
+     */
+    int addNumberOfBytes(int n, char c)
+    {
+        int base;
+
+        switch (c) {
+            case 'x':
+            case 'c':
+            case 'b':
+            case 'B':
+            case '?':
+            case 's':
+            case 'p':
+                base = 1;
+                break;
+            case 'h':
+            case 'H':
+            case 'e':
+                base = 2;
+                break;
+            case 'i':
+            case 'I':
+            case 'l':
+            case 'L':
+            case 'f':
+                base = 4;
+                break;
+            case 'q':
+            case 'Q':
+            case 'd':
+                base = 8;
+                break;
+            default:
+                base = 0;
+                break;
+        }
+        return base * n;
+    }
+
+    /**
+     * Slices the header bytes out of {@link #contents} at the current offset.
+     * The port is unfinished: the slice is not decoded yet and an empty
+     * {@link Header} is returned. The offset is not advanced either.
+     *
+     * @return a new, unpopulated header
+     */
+    Header parseHeader(){
+        String headerfmt = "32shhIIIIII4s4sIIH";
+        int headerlen = calcsize(headerfmt);
+        // Not consumed yet; kept so the slice is ready for the decoder below.
+        String headerData = this.contents.substring(this.offset,
+                                                    this.offset+headerlen);
+
+        Header parsedHeader = new Header(); //createHeaderBasedOn(headerData);
+
+        /*
+        # unpack header, zip up into list of tuples
+        results = zip(fields, unpack(headerfmt, self.contents[self.offset:self.offset+headerlen]))
+
+        # increment offset into file
+        this.offset += headerlen;
+
+        # convert tuple array to dictionary
+        resultsDict = utils.toDict(results);
+        */
+        return parsedHeader;
+    }
+
+    /**
+     * Decodes a {@link Header} from the given file with Preon. On failure the
+     * problem is reported on standard output and an empty header is returned,
+     * so callers never receive null.
+     *
+     * @param file file to decode
+     * @return the decoded header, or an empty one if decoding failed
+     */
+    public Header createHeaderBasedOn(File file)
+    {
+        try {
+            Codec<Header> codec = Codecs.create(Header.class);
+            return Codecs.decode(codec, file);
+        }
+        catch (IOException e) {
+            return reportAndFallBack(file, e);
+        }
+        catch (DecodingException e) {
+            return reportAndFallBack(file, e);
+        }
+    }
+
+    /** Prints the failure itself (its cause is often null) and returns an empty header. */
+    private Header reportAndFallBack(File file, Exception e)
+    {
+        System.out.println("Could not decode header from " + file + ": " + e);
+        return new Header();
+    }
+    /*
+  def readRecord(self, recordnum, disable_compression=False):
+    if self.config:
+      if self.config['palmdoc']['Compression'] == 1 or disable_compression:
+        return self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+1]['record Data Offset']];
+      elif self.config['palmdoc']['Compression'] == 2:
+        result = uncompress_lz77(self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+1]['record Data Offset']-self.config['mobi']['extra bytes']])
+        return result
+
+  def readImageRecord(self, imgnum):
+    if self.config:
+      recordnum = self.config['mobi']['First Image index'] + imgnum;
+      return self.readRecord(recordnum, disable_compression=True);
+
+  def author(self):
+    "Returns the author of the book"
+    return self.config['exth']['records'][100]
+
+  def title(self):
+    "Returns the title of the book"
+    return self.config['mobi']['Full Name']
+     */
+}
--- a/MobiReader/src/mobireader/MobiHeader.java
+++ b/MobiReader/src/mobireader/MobiHeader.java
@ -0,0 +1,92 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package mobireader;
+
+import nl.flotsam.preon.annotation.BoundNumber;
+import nl.flotsam.preon.annotation.BoundString;
+
+/**
+ * Preon-annotated description of the MOBI header. The declarations follow the
+ * struct format quoted at the top of the class one-to-one: each 'I' is a
+ * number bound with size 32, each 'H' a number bound with size 16, and each
+ * 'Ns' a string bound with size N.
+ *
+ * Presumably Preon decodes the fields in declaration order, so do not
+ * reorder them.
+ *
+ * NOTE(review): the quoted format starts with '>' (big-endian in Python's
+ * struct notation), but no byteOrder is set on the annotations -- confirm
+ * that Preon's default byte order is the intended one.
+ *
+ * @author att
+ */
+public class MobiHeader {
+     // Python struct format this class mirrors; the groups below are
+     // commented with the part of the format they correspond to.
+     //headerfmt = '> IIII II 40s III IIIII IIII I 36s IIII 8s HHIIIII'
+     // IIII
+     @BoundNumber(size="32")
+     long identifier;
+     @BoundNumber(size="32")
+     long header_length;
+     @BoundNumber(size="32")
+     long Mobi_type;
+     @BoundNumber(size="32")
+     long text_Encoding;
+     
+     // II
+     @BoundNumber(size="32")
+     long Unique_ID;
+     @BoundNumber(size="32")
+     long Generator_version;
+     
+     // 40s
+     @BoundString(size="40")
+     String Reserved;
+     
+     // III
+     @BoundNumber(size="32")
+     long FirstNonBookIndex;
+     @BoundNumber(size="32")
+     long FullNameOffset;
+     @BoundNumber(size="32")
+     long FullNameLength;
+
+     // IIIII
+     @BoundNumber(size="32")
+     long Language;
+     @BoundNumber(size="32")
+     long InputLanguage;
+     @BoundNumber(size="32")
+     long OutputLanguage;
+     @BoundNumber(size="32")
+     long FormatVersion;
+     @BoundNumber(size="32")
+     long FirstImageIndex;
+
+     // IIII
+     @BoundNumber(size="32")
+     long FirstHuffRecord;
+     @BoundNumber(size="32")
+     long HuffRecordCount;
+     @BoundNumber(size="32")
+     long FirstDATPRecord;
+     @BoundNumber(size="32")
+     long DATPRecordCount;
+
+     // I
+     @BoundNumber(size="32")
+     long EXTHFlags;
+
+     // 36s
+     @BoundString(size="36")
+     String unknown36Bytes;
+
+     // IIII
+     @BoundNumber(size="32")
+     long DRMOffset;
+     @BoundNumber(size="32")
+     long DRMCount;
+     @BoundNumber(size="32")
+     long DRMSize;
+     @BoundNumber(size="32")
+     long DRMFlags;
+
+     // 8s
+     @BoundString(size="8")
+     String unknown8Bytes;
+     
+     // HH IIIII
+     @BoundNumber(size="16")
+     int Unknown1;
+     @BoundNumber(size="16")
+     int LastImageRecord;
+     @BoundNumber(size="32")
+     long Unknown2;
+     @BoundNumber(size="32")
+     long FCISRecord;
+     @BoundNumber(size="32")
+     long Unknown3;
+     @BoundNumber(size="32")
+     long FLISRecord;
+     @BoundNumber(size="32")
+     long Unknown4;
+}
--- a/MobiReader/src/mobireader/MobiReader.java
+++ b/MobiReader/src/mobireader/MobiReader.java
@ -34,5 +34,6 @@ public class MobiReader extends Application {

    public static void main(String[] args) {
        launch(args);
+        
    }
 }
--- a/MobiReader/src/mobireader/PalmDocHeader.java
+++ b/MobiReader/src/mobireader/PalmDocHeader.java
@ -0,0 +1,30 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package mobireader;
+
+import nl.flotsam.preon.annotation.BoundNumber;
+
+/**
+ * Preon-annotated description of the PalmDoc header. The declarations follow
+ * the struct format quoted at the top of the class one-to-one: each 'H' is a
+ * number bound with size 16 and the single 'I' a number bound with size 32.
+ *
+ * Presumably Preon decodes the fields in declaration order, so do not
+ * reorder them.
+ *
+ * NOTE(review): the quoted format starts with '>' (big-endian in Python's
+ * struct notation), but no byteOrder is set on the annotations -- confirm
+ * that Preon's default byte order is the intended one.
+ *
+ * @author att
+ */
+public class PalmDocHeader {
+    // Python struct format this class mirrors: H H I H H H H.
+    //headerfmt = '>HHIHHHH'
+    @BoundNumber(size="16")
+    int Compression;
+    @BoundNumber(size="16")
+    int Unused;
+    @BoundNumber(size="32")
+    long textLength;
+    @BoundNumber(size="16")
+    int recordCount;
+    @BoundNumber(size="16")
+    int recordSize;
+    @BoundNumber(size="16")
+    int encryptionType;
+    @BoundNumber(size="16")
+    int unknown;
+    
+}
--- a/MobiReader/src/unzipping/LZ77.java
+++ b/MobiReader/src/unzipping/LZ77.java
@ -0,0 +1,289 @@
+ /*
+ * The MIT License
+ * 
+ * Copyright (c) 2009 Olle Törnström studiomediatech.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * CREDIT: Initially implemented by Diogo Kollross and made publicly available
+ *         on the website http://www.geocities.com/diogok_br/lz77. Edited here
+ *         to provide two flavours for JavaScript usage, either as standalone
+ *         compressor/decompressor or as class for copy/paste use.
+ */
+
+/**
+ * This class provides simple LZ77 compression and decompression.
+ * 
+ * USAGE: Place in your own project package of choice adding preferred package
+ *        setting.
+ * 
+ * @author Olle Törnström olle[at]studiomediatech[dot]com
+ * @created 2009-02-18
+ */
+package unzipping;
+
+/**
+ * Text-oriented LZ77 compressor and decompressor.
+ *
+ * Compressed output is itself a string. A repeated run is replaced by a
+ * four-character reference: the prefix character '`', two base-96 digits
+ * holding the distance and one base-96 digit holding the length minus
+ * {@code minStringLength}. A literal '`' in the input is written as "``".
+ */
+public class LZ77 {
+
+	/** Characters in one encoded reference: prefix + 2 distance digits + 1 length digit. */
+	private static final int REFERENCE_TOKEN_LENGTH = 4;
+
+	private final char referencePrefix;
+	private final int referenceIntBase;
+	private final int referenceIntFloorCode;
+	private final int referenceIntCeilCode;
+	private final int maxStringDistance;
+	private final int minStringLength;
+	private final int maxStringLength;
+	private final int defaultWindowLength;
+	private final int maxWindowLength;
+
+	// CONSTRUCTOR
+
+	/** Creates a codec with the fixed parameters of the original algorithm. */
+	public LZ77() {
+
+		referencePrefix = '`';
+		referenceIntBase = 96;
+		referenceIntFloorCode = (int) ' ';
+		referenceIntCeilCode = referenceIntFloorCode + referenceIntBase;
+		maxStringDistance = referenceIntBase * referenceIntBase - 1;
+		minStringLength = 5;
+		maxStringLength = referenceIntBase - 1 + minStringLength;
+		defaultWindowLength = 144;
+		maxWindowLength = maxStringDistance + minStringLength;
+	}
+
+
+	// LAZY STATIC METHODS - ADDED BY: DAN!
+
+	/**
+	 * Compresses {@code data} with a fresh codec and the default window.
+	 *
+	 * @param data string to compress
+	 * @return LZ77 compressed string
+	 */
+	public static String compressStr(String data) {
+		LZ77 lz = new LZ77();
+		return lz.compress(data, null);
+	}
+
+	/**
+	 * Decompresses {@code data} with a fresh codec.
+	 *
+	 * @param data string produced by one of the compress methods
+	 * @return the original string
+	 */
+	public static String decompressStr(String data) {
+		LZ77 lz = new LZ77();
+		return lz.decompress(data);
+	}
+
+	// PUBLIC METHODS
+
+	/**
+	 * Compress string data using the LZ77 algorithm.
+	 * 
+	 * @param data
+	 *            String data to compress
+	 * @return LZ77 compressed string
+	 */
+	public String compress(String data) {
+
+		return compress(data, null);
+	}
+
+	/**
+	 * Compress string data using the LZ77 algorithm.
+	 * 
+	 * @param data
+	 *            String data to compress
+	 * @param windowLength
+	 *            Optional window length; {@code null} selects the default
+	 * @return LZ77 compressed string
+	 * @throws IllegalArgumentException
+	 *             if the window length exceeds the maximum
+	 */
+	public String compress(String data, Integer windowLength) {
+
+		if (windowLength == null)
+			windowLength = defaultWindowLength;
+
+		if (windowLength > maxWindowLength)
+			throw new IllegalArgumentException("Window length too large");
+
+		StringBuilder compressed = new StringBuilder();
+
+		int pos = 0;
+		int lastPos = data.length() - minStringLength;
+
+		while (pos < lastPos) {
+
+			int searchStart = Math.max(pos - windowLength, 0);
+			int matchLength = minStringLength;
+			boolean foundMatch = false;
+			int bestMatchDistance = maxStringDistance;
+			int bestMatchLength = 0;
+
+			while ((searchStart + matchLength) < pos) {
+
+				// regionMatches is false when either region runs past the end
+				// of data, which is exactly when the two clamped substrings
+				// of the original implementation differed in length.
+				boolean isValidMatch = data.regionMatches(searchStart, data,
+						pos, matchLength)
+						&& matchLength < maxStringLength;
+
+				if (isValidMatch) {
+
+					matchLength++;
+					foundMatch = true;
+
+				} else {
+
+					int realMatchLength = matchLength - 1;
+
+					if (foundMatch && (realMatchLength > bestMatchLength)) {
+						bestMatchDistance = pos - searchStart - realMatchLength;
+						bestMatchLength = realMatchLength;
+					}
+
+					matchLength = minStringLength;
+					searchStart++;
+					foundMatch = false;
+				}
+			}
+
+			if (bestMatchLength != 0) {
+
+				compressed.append(referencePrefix)
+						.append(encodeReferenceInt(bestMatchDistance, 2))
+						.append(encodeReferenceLength(bestMatchLength));
+
+				pos += bestMatchLength;
+
+			} else {
+
+				char literal = data.charAt(pos);
+				compressed.append(literal);
+				if (literal == referencePrefix) {
+					// A literal prefix character is escaped by doubling it.
+					compressed.append(referencePrefix);
+				}
+
+				pos++;
+			}
+		}
+
+		// The tail is copied without searching for matches, but prefix
+		// characters in it still have to be escaped, otherwise decompress
+		// would read them as the start of a reference.
+		String prefix = String.valueOf(referencePrefix);
+		compressed.append(data.substring(pos).replace(prefix, prefix + prefix));
+
+		return compressed.toString();
+	}
+
+	/**
+	 * Decompress a string produced by one of the compress methods.
+	 * 
+	 * @param data
+	 *            LZ77 compressed string
+	 * @return the decompressed string
+	 * @throws RuntimeException
+	 *             if a reference contains an invalid digit character
+	 */
+	public String decompress(String data) {
+
+		StringBuilder decompressed = new StringBuilder();
+		int pos = 0;
+
+		while (pos < data.length()) {
+
+			char currentChar = data.charAt(pos);
+
+			if (currentChar != referencePrefix) {
+
+				decompressed.append(currentChar);
+				pos++;
+
+			} else if (data.charAt(pos + 1) == referencePrefix) {
+
+				// Doubled prefix: an escaped literal prefix character.
+				decompressed.append(referencePrefix);
+				pos += 2;
+
+			} else {
+
+				int distance = decodeReferenceInt(data.substring(pos + 1,
+						pos + 3), 2);
+
+				int length = decodeReferenceLength(data.substring(pos + 3,
+						pos + 4));
+
+				int start = decompressed.length() - distance - length;
+				int end = start + length;
+				decompressed.append(decompressed.substring(start, end));
+				pos += REFERENCE_TOKEN_LENGTH;
+			}
+		}
+
+		return decompressed.toString();
+	}
+
+	// PRIVATE METHODS
+
+	/** Encodes {@code value} as exactly {@code width} base-96 digits, most significant first. */
+	private String encodeReferenceInt(int value, int width) {
+
+		if ((value >= 0) && (value < (Math.pow(referenceIntBase, width) - 1))) {
+
+			StringBuilder encoded = new StringBuilder();
+
+			while (value > 0) {
+				char c = (char) ((value % referenceIntBase) + referenceIntFloorCode);
+				encoded.insert(0, c);
+				value /= referenceIntBase;
+			}
+
+			// Left-pad with the zero digit up to the requested width.
+			while (encoded.length() < width) {
+				encoded.insert(0, (char) referenceIntFloorCode);
+			}
+
+			return encoded.toString();
+
+		} else {
+
+			throw new IllegalArgumentException("Reference int out of range: "
+					+ value + " (width = " + width + ")");
+		}
+	}
+
+	/** Encodes a match length as one digit, stored relative to the minimum match length. */
+	private String encodeReferenceLength(int length) {
+
+		return encodeReferenceInt(length - minStringLength, 1);
+	}
+
+	/** Decodes the first {@code width} characters of {@code data} as base-96 digits. */
+	private int decodeReferenceInt(String data, int width) {
+
+		int value = 0;
+
+		for (int i = 0; i < width; i++) {
+
+			value *= referenceIntBase;
+
+			int charCode = (int) data.charAt(i);
+
+			if ((charCode >= referenceIntFloorCode)
+					&& (charCode <= referenceIntCeilCode)) {
+
+				value += charCode - referenceIntFloorCode;
+
+			} else {
+
+				throw new RuntimeException(
+						"Invalid char code in reference int: " + charCode);
+			}
+		}
+
+		return value;
+	}
+
+	/** Decodes a one-digit match length and adds back the minimum match length. */
+	private int decodeReferenceLength(String data) {
+
+		return decodeReferenceInt(data, 1) + minStringLength;
+	}
+}
--- a/MobiReader/src/unzipping/lz77Unzipper.java
+++ b/MobiReader/src/unzipping/lz77Unzipper.java
@ -0,0 +1,101 @@
+/*
+ * To change this template, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package unzipping;
+
+/**
+ * Empty placeholder: the class declares no fields or methods yet.
+ *
+ * The block comment that follows the class holds a Python implementation
+ * (uncompress_lz77), itself described there as ported from the PalmDoc Perl
+ * library; presumably it is the reference for a future Java port.
+ *
+ * @author att
+ */
+public class lz77Unzipper {
+    
+}
+/*
+import struct
+# ported directly from the PalmDoc Perl library
+# http://kobesearch.cpan.org/htdocs/EBook-Tools/EBook/Tools/PalmDoc.pm.html
+
+def uncompress_lz77(data):
+  length = len(data);
+  offset = 0;   # Current offset into data
+  # char;      # Character being examined
+  # ord;      # Ordinal of $char
+  # lz77;      # 16-bit Lempel-Ziv 77 length-offset pair
+  # lz77offset;   # LZ77 offset
+  # lz77length;   # LZ77 length
+  # lz77pos;    # Position inside $lz77length
+  text = '';   # Output (uncompressed) text
+  # textlength;   # Length of uncompressed text during LZ77 pass
+  # textpos;    # Position inside $text during LZ77 pass
+
+  while offset < length:
+    # char = substr($data,$offset++,1);
+    char = data[offset];
+    offset += 1;
+    ord_ = ord(char);
+
+    # print " ".join([repr(char), hex(ord_)])
+
+    # The long if-elsif chain is the best logic for $ord handling
+    ## no critic (Cascading if-elsif chain)
+    if (ord_ == 0):
+      # Nulls are literal
+      text += char;
+    elif (ord_ <= 8):
+      # Next $ord bytes are literal
+      text += data[offset:offset+ord_] # text .=substr($data,$offset,ord);
+      offset += ord_;
+    elif (ord_ <= 0x7f):
+      # Values from 0x09 through 0x7f are literal
+      text += char;
+    elif (ord_ <= 0xbf):
+      # Data is LZ77-compressed
+
+      # From Wikipedia:
+      # "A length-distance pair is always encoded by a two-byte
+      # sequence. Of the 16 bits that make up these two bytes,
+      # 11 bits go to encoding the distance, 3 go to encoding
+      # the length, and the remaining two are used to make sure
+      # the decoder can identify the first byte as the beginning
+      # of such a two-byte sequence."
+
+      offset += 1;
+      if (offset > len(data)):
+        print("WARNING: offset to LZ77 bits is outside of the data: %d" % offset);
+        return text;
+
+      lz77, = struct.unpack('>H', data[offset-2:offset])
+
+      # Leftmost two bits are ID bits and need to be dropped
+      lz77 &= 0x3fff;
+
+      # Length is rightmost 3 bits + 3
+      lz77length = (lz77 & 0x0007) + 3;
+
+      # Remaining 11 bits are offset
+      lz77offset = lz77 >> 3;
+      if (lz77offset < 1):
+        print("WARNING: LZ77 decompression offset is invalid!");
+        return text;
+
+      # Getting text from the offset is a little tricky, because
+      # in theory you can be referring to characters you haven't
+      # actually decompressed yet. You therefore have to check
+      # the reference one character at a time.
+      textlength = len(text);
+      for lz77pos in range(lz77length): # for($lz77pos = 0; $lz77pos < $lz77length; $lz77pos++)
+        textpos = textlength - lz77offset;
+        if (textpos < 0):
+          print("WARNING: LZ77 decompression reference is before"+
+                " beginning of text! %x" % lz77);
+          return;
+
+        text += text[textpos:textpos+1]; #text .= substr($text,$textpos,1);
+        textlength+=1;
+    else:
+      # 0xc0 - 0xff are single characters (XOR 0x80) preceded by
+      # a space
+      text += ' ' + chr(ord_ ^ 0x80);
+  return text;
+*/