Wrum, wrum, wrum. Java is awesome at parsing binary files.

2013-04-06 15:46:05 +02:00 · 2013-04-06 15:46:05 +02:00 · c3df7b0c97
parent 0252245893
commit c3df7b0c97
8 changed files with 737 additions and 0 deletions
--- a/MobiReader/src/mobireader/EXTHHeader.java
+++ b/MobiReader/src/mobireader/EXTHHeader.java
@ -0,0 +1,23 @@
 /*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
 package mobireader;
 import nl.flotsam.preon.annotation.BoundNumber;
 /**
 *
 * @author att
 */
 public class EXTHHeader {
     // Field layout follows the struct format '>III': three consecutive
     // 32-bit numbers, bound in declaration order.
     // NOTE(review): '>' means big-endian in struct notation, but no byte
     // order is configured on these bindings -- confirm the codec default.

     /** Header identifier, bound as a 32-bit number. */
     @BoundNumber(size="32") long identifier;

     /** Header length, bound as a 32-bit number. */
     @BoundNumber(size="32") long headerLength;

     /** Record count, bound as a 32-bit number. */
     @BoundNumber(size="32") long recordCount;
 }
--- a/MobiReader/src/mobireader/Header.java
+++ b/MobiReader/src/mobireader/Header.java
@ -0,0 +1,29 @@
 /*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
 package mobireader;
 import nl.flotsam.preon.annotation.BoundNumber;
 import nl.flotsam.preon.annotation.BoundString;
 /**
 *
 * @author att
 */
 public class Header {
     // Field widths follow the struct format used by Mobi.parseHeader():
     //   "32shhIIIIII4s4sIIH"  (78 bytes in total)
     // 's' = bytes of text, 'h'/'H' = 16-bit number, 'I' = 32-bit number.
     // Fields are bound in declaration order, so the order below is significant.
     // NOTE(review): no byte order is configured on these bindings -- confirm
     // that the codec default matches the byte order of the file format.

     /** Name, 32 bytes of text ("32s"). */
     public @BoundString(size="32") String name;

     /** 16-bit value ('h'). */
     @BoundNumber(size="16") int attributes;
     /** 16-bit value ('h'). */
     @BoundNumber(size="16") int version;

     // Six consecutive 32-bit values ("IIIIII").
     @BoundNumber(size="32") int created;
     @BoundNumber(size="32") int modified;
     @BoundNumber(size="32") int backup;
     @BoundNumber(size="32") int modnum;
     @BoundNumber(size="32") int appInfoId;
     @BoundNumber(size="32") int sortInfoId;

     // Two 4-byte text fields ("4s4s").
     @BoundString(size="4") String type;
     @BoundString(size="4") String creator;

     // Two 32-bit values ("II").
     @BoundNumber(size="32") int uniqueIDseed;
     @BoundNumber(size="32") int nextRecordListId;

     /** 16-bit value ('H'). */
     @BoundNumber(size="16") int number_of_records;
 }
--- a/MobiReader/src/mobireader/Mobi.java
+++ b/MobiReader/src/mobireader/Mobi.java
@ -0,0 +1,172 @@
 /*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
 package mobireader;
 import com.google.common.io.Files;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOError;
 import java.io.IOException;
 import java.io.ObjectInputStream;
 import java.util.HashMap;
 import nl.flotsam.preon.Codec;
 import nl.flotsam.preon.Codecs;
 import nl.flotsam.preon.DecodingException;
 import nl.flotsam.preon.annotation.BoundNumber;
 import nl.flotsam.preon.annotation.BoundString;
 /**
 *
 * @author att
 */
 public class Mobi {
     // File to read; set once by the constructor.
     File file;
     // Current read position inside 'contents'.
     int offset = 0;
     // Whole file, one char per byte (see parse()).
     String contents;
     // Header produced by parseHeader().
     Header header;

     /**
      * Creates a reader for the given file. Nothing is read until
      * {@link #parse()} is called.
      *
      * @param filename path of the file to read
      */
     public Mobi(String filename)
     {
         // Constructing a File performs no I/O, so there is nothing to catch here.
         this.file = new File(filename);
     }

     /**
      * Loads the whole file into memory and parses its header.
      *
      * @throws IOException if the file cannot be read
      */
     void parse() throws IOException
     {
         byte[] raw = Files.toByteArray(this.file);
         // ISO-8859-1 maps every byte to exactly one char, so string indices
         // stay equal to byte offsets. The platform default charset may merge
         // or replace bytes of binary data.
         this.contents = new String(raw, "ISO-8859-1");
         this.header = parseHeader();
         //this.records = self.parseRecordInfoList();
         //this..readRecord0()
     }

     /**
      * Computes the number of bytes described by a struct-style format string
      * such as {@code "32shhIIIIII4s4sIIH"}.
      *
      * A decimal count in front of a type code multiplies that code's size;
      * a code without a count is taken once. Characters that are not known
      * type codes (for example a byte-order prefix) contribute nothing, and a
      * count at the very end of the string with no code after it is ignored.
      *
      * @param headerFormat format string to measure
      * @return total size in bytes
      */
     public int calcsize(String headerFormat)
     {
         int size = 0;
         StringBuilder count = new StringBuilder();
         for (int i = 0; i < headerFormat.length(); i++)
         {
             char c = headerFormat.charAt(i);
             if (Character.isDigit(c))
             {
                 // Still collecting the repeat count.
                 count.append(c);
                 continue;
             }
             int repeat = count.length() > 0
                     ? Integer.parseInt(count.toString())
                     : 1;
             count.setLength(0);
             size += addNumberOfBytes(repeat, c);
         }
         return size;
     }

     /**
      * Returns the number of bytes taken by {@code n} items of the struct type
      * code {@code c}, using the standard (unpadded) struct sizes.
      *
      * @param n repeat count
      * @param c struct type code
      * @return {@code n} times the size of the type, or 0 for an unknown code
      */
     int addNumberOfBytes(int n, char c)
     {
         int base;
         switch (c) {
             case 'x':
             case 'c':
             case 's':
             case 'p':
             case 'b':
             case 'B':
             case '?':
                 base = 1;
                 break;
             case 'h':
             case 'H':
                 base = 2;
                 break;
             case 'i':
             case 'I':
             case 'l':
             case 'L':
             case 'f':
                 base = 4;
                 break;
             case 'q':
             case 'Q':
             case 'd':
                 base = 8;
                 break;
             default:
                 // Unknown codes and byte-order prefixes take no space.
                 base = 0;
                 break;
         }
         return base * n;
     }

     /**
      * Slices the header bytes out of the loaded contents.
      *
      * The decoding step is not implemented yet: the returned header is empty
      * and the read offset is not advanced.
      *
      * @return a new, unpopulated {@link Header}
      */
     Header parseHeader(){
         String headerfmt = "32shhIIIIII4s4sIIH";
         int headerlen = calcsize(headerfmt);
         // Not consumed yet; kept because it is the input of the decoding step
         // sketched below and it fails early when the contents are too short.
         String headerData = this.contents.substring(this.offset,
                                                     this.offset+headerlen);
         Header parsedHeader = new Header(); //createHeaderBasedOn(headerData);
         /*
         # unpack header, zip up into list of tuples
         results = zip(fields, unpack(headerfmt, self.contents[self.offset:self.offset+headerlen]))
         # increment offset into file
         this.offset += headerlen;
         # convert tuple array to dictionary
         resultsDict = utils.toDict(results);
         */
         return parsedHeader;
     }

     /**
      * Decodes a {@link Header} from the given file.
      *
      * Failures are reported on standard output and answered with an empty
      * header instead of being propagated.
      *
      * @param file file to decode
      * @return the decoded header, or an empty one if decoding failed
      */
     public Header createHeaderBasedOn(File file)
     {
         Header headerFromText;
         try {
             Codec<Header> codec = Codecs.create(Header.class);
             headerFromText = Codecs.decode(codec, file);
         }
         catch( IOException e ){
             // Print the exception itself: getCause() is frequently null and
             // would hide both the exception type and its message.
             System.out.println(e);
             headerFromText = new Header();
         }
         catch (DecodingException e)
         {
             System.out.println(e);
             headerFromText = new Header();
         }
         return headerFromText;
     }
     /*
   def readRecord(self, recordnum, disable_compression=False):
     if self.config:
       if self.config['palmdoc']['Compression'] == 1 or disable_compression:
         return self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+1]['record Data Offset']];
       elif self.config['palmdoc']['Compression'] == 2:
         result = uncompress_lz77(self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+1]['record Data Offset']-self.config['mobi']['extra bytes']])
         return result
   def readImageRecord(self, imgnum):
     if self.config:
       recordnum = self.config['mobi']['First Image index'] + imgnum;
       return self.readRecord(recordnum, disable_compression=True);
   def author(self):
     "Returns the author of the book"
     return self.config['exth']['records'][100]
   def title(self):
     "Returns the title of the book"
     return self.config['mobi']['Full Name']
      */
 }
--- a/MobiReader/src/mobireader/MobiHeader.java
+++ b/MobiReader/src/mobireader/MobiHeader.java
@ -0,0 +1,92 @@
 /*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
 package mobireader;
 import nl.flotsam.preon.annotation.BoundNumber;
 import nl.flotsam.preon.annotation.BoundString;
 /**
 *
 * @author att
 */
 public class MobiHeader {
     // Field layout follows the struct format
     //   '> IIII II 40s III IIIII IIII I 36s IIII 8s HHIIIII'
     // ('I' = 32-bit number, 'H' = 16-bit number, 'Ns' = N bytes of text).
     // Fields are bound in declaration order; the groups below match the
     // space-separated groups of the format string.
     // NOTE(review): '>' means big-endian in struct notation, but no byte
     // order is configured on these bindings -- confirm the codec default.

     // IIII
     @BoundNumber(size="32") long identifier;
     @BoundNumber(size="32") long header_length;
     @BoundNumber(size="32") long Mobi_type;
     @BoundNumber(size="32") long text_Encoding;

     // II
     @BoundNumber(size="32") long Unique_ID;
     @BoundNumber(size="32") long Generator_version;

     // 40s
     @BoundString(size="40") String Reserved;

     // III
     @BoundNumber(size="32") long FirstNonBookIndex;
     @BoundNumber(size="32") long FullNameOffset;
     @BoundNumber(size="32") long FullNameLength;

     // IIIII
     @BoundNumber(size="32") long Language;
     @BoundNumber(size="32") long InputLanguage;
     @BoundNumber(size="32") long OutputLanguage;
     @BoundNumber(size="32") long FormatVersion;
     @BoundNumber(size="32") long FirstImageIndex;

     // IIII
     @BoundNumber(size="32") long FirstHuffRecord;
     @BoundNumber(size="32") long HuffRecordCount;
     @BoundNumber(size="32") long FirstDATPRecord;
     @BoundNumber(size="32") long DATPRecordCount;

     // I
     @BoundNumber(size="32") long EXTHFlags;

     // 36s
     @BoundString(size="36") String unknown36Bytes;

     // IIII
     @BoundNumber(size="32") long DRMOffset;
     @BoundNumber(size="32") long DRMCount;
     @BoundNumber(size="32") long DRMSize;
     @BoundNumber(size="32") long DRMFlags;

     // 8s
     @BoundString(size="8") String unknown8Bytes;

     // HH
     @BoundNumber(size="16") int Unknown1;
     @BoundNumber(size="16") int LastImageRecord;

     // IIIII
     @BoundNumber(size="32") long Unknown2;
     @BoundNumber(size="32") long FCISRecord;
     @BoundNumber(size="32") long Unknown3;
     @BoundNumber(size="32") long FLISRecord;
     @BoundNumber(size="32") long Unknown4;
 }
--- a/MobiReader/src/mobireader/MobiReader.java
+++ b/MobiReader/src/mobireader/MobiReader.java
@ -34,5 +34,6 @@ public class MobiReader extends Application {
    /**
     * Program entry point; hands the command-line arguments straight to
     * {@code launch}.
     *
     * @param args command-line arguments, passed on unchanged
     */
    public static void main(String[] args) {
        // NOTE(review): presumably the launch method inherited from
        // Application -- confirm against the rest of the class.
        launch(args);
    }
 }
--- a/MobiReader/src/mobireader/PalmDocHeader.java
+++ b/MobiReader/src/mobireader/PalmDocHeader.java
@ -0,0 +1,30 @@
 /*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
 package mobireader;
 import nl.flotsam.preon.annotation.BoundNumber;
 /**
 *
 * @author att
 */
 public class PalmDocHeader {
     // Field layout follows the struct format '>HHIHHHH'
     // ('H' = 16-bit number, 'I' = 32-bit number); fields are bound in
     // declaration order.
     // NOTE(review): '>' means big-endian in struct notation, but no byte
     // order is configured on these bindings -- confirm the codec default.

     // HH
     @BoundNumber(size="16") int Compression;
     @BoundNumber(size="16") int Unused;

     // I
     @BoundNumber(size="32") long textLength;

     // HHHH
     @BoundNumber(size="16") int recordCount;
     @BoundNumber(size="16") int recordSize;
     @BoundNumber(size="16") int encryptionType;
     @BoundNumber(size="16") int unknown;
 }
--- a/MobiReader/src/unzipping/LZ77.java
+++ b/MobiReader/src/unzipping/LZ77.java
@ -0,0 +1,289 @@
 /*
 * The MIT License
 * 
 * Copyright (c) 2009 Olle Törnström studiomediatech.com
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 * CREDIT: Initially implemented by Diogo Kollross and made publicly available
 *         on the website http://www.geocities.com/diogok_br/lz77. Edited here
 *         to provide two flavours for JavaScript usage, either as standalone
 *         compressor/decompressor or as class for copy/paste use.
 */
 /**
 * This class provides simple LZ77 compression and decompression.
 * 
 * USAGE: Place in your own project package of choice adding preferred package
 *        setting.
 * 
 * @author Olle Törnström olle[at]studiomediatech[dot]com
 * @created 2009-02-18
 */
 package unzipping;
 /**
  * Text-oriented LZ77 compressor and decompressor.
  *
  * Compressed output is a string of literal characters and back references.
  * A back reference is four characters long: the reference prefix (a
  * backtick), two base-96 digits for the distance and one base-96 digit for
  * the length. A literal backtick is written as two backticks.
  *
  * NOTE(review): a distance whose first digit encodes to a backtick cannot be
  * told apart from an escaped backtick by {@link #decompress(String)}. Such
  * distances are only reachable with window lengths far above the default.
  */
 public class LZ77 {
     private final char referencePrefix;
     private final int referenceIntBase;
     private final int referenceIntFloorCode;
     private final int referenceIntCeilCode;
     private final int maxStringDistance;
     private final int minStringLength;
     private final int maxStringLength;
     private final int defaultWindowLength;
     private final int maxWindowLength;

     // CONSTRUCTOR
     /** Creates a codec with the standard parameters of this format. */
     public LZ77() {
         referencePrefix = '`';
         referenceIntBase = 96;
         referenceIntFloorCode = (int) ' ';
         referenceIntCeilCode = referenceIntFloorCode + referenceIntBase;
         maxStringDistance = referenceIntBase * referenceIntBase - 1;
         minStringLength = 5;
         maxStringLength = referenceIntBase - 1 + minStringLength;
         defaultWindowLength = 144;
         maxWindowLength = maxStringDistance + minStringLength;
     }

     // LAZY STATIC METHODS - ADDED BY: DAN!
     /**
      * Compresses {@code data} with a fresh codec and the default window.
      *
      * @param data string to compress
      * @return LZ77 compressed string
      */
     public static String compressStr(String data) {
         LZ77 lz = new LZ77();
         return lz.compress(data, null);
     }

     /**
      * Decompresses {@code data} with a fresh codec.
      *
      * @param data LZ77 compressed string
      * @return the original string
      */
     public static String decompressStr(String data) {
         LZ77 lz = new LZ77();
         return lz.decompress(data);
     }

     // PUBLIC METHODS
     /**
      * Compress string data using the LZ77 algorithm.
      *
      * @param data
      *            String data to compress
      * @return LZ77 compressed string
      */
     public String compress(String data) {
         return compress(data, null);
     }

     /**
      * Compress string data using the LZ77 algorithm.
      *
      * @param data
      *            String data to compress
      * @param windowLength
      *            Optional window length; {@code null} selects the default
      * @return LZ77 compressed string
      * @throws IllegalArgumentException
      *             if the window length exceeds the supported maximum
      */
     public String compress(String data, Integer windowLength) {
         if (windowLength == null)
             windowLength = defaultWindowLength;
         if (windowLength > maxWindowLength)
             throw new IllegalArgumentException("Window length too large");

         StringBuilder compressed = new StringBuilder();
         int pos = 0;
         // The last minStringLength characters can never start a reference;
         // they are copied (escaped) after the loop.
         int lastPos = data.length() - minStringLength;
         while (pos < lastPos) {
             int searchStart = Math.max(pos - windowLength, 0);
             int matchLength = minStringLength;
             boolean foundMatch = false;
             int bestMatchDistance = maxStringDistance;
             int bestMatchLength = 0;
             // Slide a candidate over the window, growing it while it matches
             // and remembering the longest candidate seen.
             while ((searchStart + matchLength) < pos) {
                 int sourceWindowEnd = Math.min(searchStart + matchLength,
                         data.length());
                 int targetWindowEnd = Math.min(pos + matchLength,
                         data.length());
                 String m1 = data.substring(searchStart, sourceWindowEnd);
                 String m2 = data.substring(pos, targetWindowEnd);
                 boolean isValidMatch = m1.equals(m2)
                         && matchLength < maxStringLength;
                 if (isValidMatch) {
                     matchLength++;
                     foundMatch = true;
                 } else {
                     int realMatchLength = matchLength - 1;
                     if (foundMatch && (realMatchLength > bestMatchLength)) {
                         bestMatchDistance = pos - searchStart - realMatchLength;
                         bestMatchLength = realMatchLength;
                     }
                     matchLength = minStringLength;
                     searchStart++;
                     foundMatch = false;
                 }
             }
             if (bestMatchLength != 0) {
                 compressed.append(referencePrefix)
                         .append(encodeReferenceInt(bestMatchDistance, 2))
                         .append(encodeReferenceLength(bestMatchLength));
                 pos += bestMatchLength;
             } else {
                 char literal = data.charAt(pos);
                 compressed.append(literal);
                 if (literal == referencePrefix) {
                     // Escape a literal prefix character by doubling it.
                     compressed.append(referencePrefix);
                 }
                 pos++;
             }
         }
         // Escape prefix characters in the uncompressed tail as well. A plain
         // (non-regex) replace is required: the JavaScript-style pattern used
         // previously matched nothing in Java and left the tail unescaped.
         String prefix = String.valueOf(referencePrefix);
         String tail = data.substring(pos).replace(prefix, prefix + prefix);
         return compressed.append(tail).toString();
     }

     /**
      * Decompress string data produced by {@link #compress(String)}.
      *
      * @param data
      *            LZ77 compressed string
      * @return the decompressed string
      * @throws RuntimeException
      *             if a reference contains a character outside the digit range
      */
     public String decompress(String data) {
         StringBuilder decompressed = new StringBuilder();
         int pos = 0;
         while (pos < data.length()) {
             char currentChar = data.charAt(pos);
             if (currentChar != referencePrefix) {
                 decompressed.append(currentChar);
                 pos++;
                 continue;
             }
             char nextChar = data.charAt(pos + 1);
             if (nextChar == referencePrefix) {
                 // Doubled prefix: one literal prefix character.
                 decompressed.append(referencePrefix);
                 pos += 2;
                 continue;
             }
             int distance = decodeReferenceInt(
                     data.substring(pos + 1, pos + 3), 2);
             int length = decodeReferenceLength(
                     data.substring(pos + 3, pos + 4));
             int start = decompressed.length() - distance - length;
             int end = start + length;
             decompressed.append(decompressed.substring(start, end));
             // A reference occupies prefix + 2 distance digits + 1 length digit.
             pos += minStringLength - 1;
         }
         return decompressed.toString();
     }

     // PRIVATE METHODS
     /**
      * Encodes a non-negative value as exactly {@code width} base-96 digits,
      * most significant digit first, padded with the zero digit.
      */
     private String encodeReferenceInt(int value, int width) {
         if ((value < 0)
                 || (value >= (Math.pow(referenceIntBase, width) - 1))) {
             throw new IllegalArgumentException("Reference int out of range: "
                     + value + " (width = " + width + ")");
         }
         StringBuilder encoded = new StringBuilder();
         while (value > 0) {
             encoded.insert(0,
                     (char) ((value % referenceIntBase) + referenceIntFloorCode));
             value = value / referenceIntBase;
         }
         while (encoded.length() < width) {
             encoded.insert(0, (char) referenceIntFloorCode);
         }
         return encoded.toString();
     }

     /** Encodes a match length as one digit, relative to the minimum length. */
     private String encodeReferenceLength(int length) {
         return encodeReferenceInt(length - minStringLength, 1);
     }

     /** Decodes the first {@code width} base-96 digits of {@code data}. */
     private int decodeReferenceInt(String data, int width) {
         int value = 0;
         for (int i = 0; i < width; i++) {
             value *= referenceIntBase;
             int charCode = (int) data.charAt(i);
             // Valid digits are floor .. ceil-1; the ceil code itself would
             // decode to the digit value 96, which is not a base-96 digit.
             if ((charCode >= referenceIntFloorCode)
                     && (charCode < referenceIntCeilCode)) {
                 value += charCode - referenceIntFloorCode;
             } else {
                 throw new RuntimeException(
                         "Invalid char code in reference int: " + charCode);
             }
         }
         return value;
     }

     /** Decodes a one-digit match length back to an absolute length. */
     private int decodeReferenceLength(String data) {
         return decodeReferenceInt(data, 1) + minStringLength;
     }
 }
--- a/MobiReader/src/unzipping/lz77Unzipper.java
+++ b/MobiReader/src/unzipping/lz77Unzipper.java
@ -0,0 +1,101 @@
 /*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
 package unzipping;
 /**
 *
 * @author att
 */
 public class lz77Unzipper {
     // Placeholder: this class has no members yet. The commented-out Python
     // function uncompress_lz77 that follows this class in the file has not
     // been ported to Java.
 }
 /*
 import struct
 # ported directly from the PalmDoc Perl library
 # http://kobesearch.cpan.org/htdocs/EBook-Tools/EBook/Tools/PalmDoc.pm.html
 def uncompress_lz77(data):
  length = len(data);
  offset = 0;   # Current offset into data
  # char;      # Character being examined
  # ord;      # Ordinal of $char
  # lz77;      # 16-bit Lempel-Ziv 77 length-offset pair
  # lz77offset;   # LZ77 offset
  # lz77length;   # LZ77 length
  # lz77pos;    # Position inside $lz77length
  text = '';   # Output (uncompressed) text
  # textlength;   # Length of uncompressed text during LZ77 pass
  # textpos;    # Position inside $text during LZ77 pass
  while offset < length:
    # char = substr($data,$offset++,1);
    char = data[offset];
    offset += 1;
    ord_ = ord(char);
    # print " ".join([repr(char), hex(ord_)])
    # The long if-elsif chain is the best logic for $ord handling
    ## no critic (Cascading if-elsif chain)
    if (ord_ == 0):
      # Nulls are literal
      text += char;
    elif (ord_ <= 8):
      # Next $ord bytes are literal
      text += data[offset:offset+ord_] # text .=substr($data,$offset,ord);
      offset += ord_;
    elif (ord_ <= 0x7f):
      # Values from 0x09 through 0x7f are literal
      text += char;
    elif (ord_ <= 0xbf):
      # Data is LZ77-compressed
      # From Wikipedia:
      # "A length-distance pair is always encoded by a two-byte
      # sequence. Of the 16 bits that make up these two bytes,
      # 11 bits go to encoding the distance, 3 go to encoding
      # the length, and the remaining two are used to make sure
      # the decoder can identify the first byte as the beginning
      # of such a two-byte sequence."
      offset += 1;
      if (offset > len(data)):
        print("WARNING: offset to LZ77 bits is outside of the data: %d" % offset);
        return text;
      lz77, = struct.unpack('>H', data[offset-2:offset])
      # Leftmost two bits are ID bits and need to be dropped
      lz77 &= 0x3fff;
      # Length is rightmost 3 bits + 3
      lz77length = (lz77 & 0x0007) + 3;
      # Remaining 11 bits are offset
      lz77offset = lz77 >> 3;
      if (lz77offset < 1):
        print("WARNING: LZ77 decompression offset is invalid!");
        return text;
      # Getting text from the offset is a little tricky, because
      # in theory you can be referring to characters you haven't
      # actually decompressed yet. You therefore have to check
      # the reference one character at a time.
      textlength = len(text);
      for lz77pos in range(lz77length): # for($lz77pos = 0; $lz77pos < $lz77length; $lz77pos++)
        textpos = textlength - lz77offset;
        if (textpos < 0):
          print("WARNING: LZ77 decompression reference is before"+
                " beginning of text! %x" % lz77);
          return;
        text += text[textpos:textpos+1]; #text .= substr($text,$textpos,1);
        textlength+=1;
    else:
      # 0xc0 - 0xff are single characters (XOR 0x80) preceded by
      # a space
      text += ' ' + chr(ord_ ^ 0x80);
  return text;
 */