diff --git a/build.gradle b/build.gradle
index 67c0f6d..bb2881b 100644
--- a/build.gradle
+++ b/build.gradle
@@ -41,6 +41,7 @@ dependencies {
include "org.tukaani:xz:1.8"
modCompile 'com.github.shevek:parallelgzip:master-SNAPSHOT'
+ include 'com.github.shevek:parallelgzip:master-SNAPSHOT'
}
processResources {
diff --git a/src/Copyright_Notice.txt b/src/Copyright_Notice.txt
index b755408..4740a3f 100644
--- a/src/Copyright_Notice.txt
+++ b/src/Copyright_Notice.txt
@@ -1,7 +1,9 @@
This project uses third party libraries as its dependencies and includes them in jar. Those are :
- Apache Commons Compress version 1.20 licensed under Apache License Version 2.0 which can be found at http://www.apache.org/licenses/
+ Apache Commons Compress licensed under Apache License Version 2.0 which can be found at http://www.apache.org/licenses/
Cotton config, Cotton logging, and Jankson-Fabric all by Cotton team licensed under MIT license which can be found at https://github.com/CottonMC/Cotton
XZ for Java by Tukaani released as public domain. https://tukaani.org/xz/java.html
+ parallelgzip by shevek licensed under Apache License Version 2.0 which can be found at http://www.apache.org/licenses/
+ Parallel BZip2 compression from At4J (http://at4j.sourceforge.net/) by Karl Gustafsson licensed under LGPL v3
Some code was partially or fully inspired by:
Parallel zip compression: https://stackoverflow.com/questions/54624695/how-to-implement-parallel-zip-creation-with-scatterzipoutputstream-with-zip64-su
diff --git a/src/main/java/net/szum123321/textile_backup/core/MakeBackupThread.java b/src/main/java/net/szum123321/textile_backup/core/MakeBackupThread.java
index 4cb4677..d5700ea 100644
--- a/src/main/java/net/szum123321/textile_backup/core/MakeBackupThread.java
+++ b/src/main/java/net/szum123321/textile_backup/core/MakeBackupThread.java
@@ -23,11 +23,11 @@ import net.minecraft.server.command.ServerCommandSource;
import net.minecraft.world.dimension.DimensionType;
import net.szum123321.textile_backup.TextileBackup;
import net.szum123321.textile_backup.core.compressors.GenericTarCompressor;
+import net.szum123321.textile_backup.core.compressors.ParallelBZip2Compressor;
import net.szum123321.textile_backup.core.compressors.ParallelZipCompressor;
import org.anarres.parallelgzip.ParallelGZIPOutputStream;
-import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;
-import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.apache.commons.compress.compressors.xz.XZCompressorOutputStream;
+import org.at4j.comp.bzip2.BZip2OutputStream;
import java.io.File;
import java.io.IOException;
@@ -72,7 +72,7 @@ public class MakeBackupThread implements Runnable {
break;
case BZIP2:
- GenericTarCompressor.createArchive(world, outFile, BZip2CompressorOutputStream.class, ctx);
+ ParallelBZip2Compressor.createArchive(world, outFile, ctx);
break;
case GZIP:
diff --git a/src/main/java/net/szum123321/textile_backup/core/compressors/GenericTarCompressor.java b/src/main/java/net/szum123321/textile_backup/core/compressors/GenericTarCompressor.java
index 5c07d06..85dffb7 100644
--- a/src/main/java/net/szum123321/textile_backup/core/compressors/GenericTarCompressor.java
+++ b/src/main/java/net/szum123321/textile_backup/core/compressors/GenericTarCompressor.java
@@ -5,7 +5,6 @@ import net.szum123321.textile_backup.TextileBackup;
import net.szum123321.textile_backup.core.Utilities;
import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
-import org.apache.commons.compress.compressors.CompressorOutputStream;
import org.apache.commons.compress.utils.IOUtils;
@@ -29,15 +28,15 @@ public class GenericTarCompressor {
File input = in.getCanonicalFile();
- Files.walk(input.toPath()).filter(
- path -> !path.equals(input.toPath()) &&
- path.toFile().isFile() &&
- !Utilities.isBlacklisted(input.toPath().relativize(path))
+ Files.walk(input.toPath()
+ ).filter(path -> !path.equals(input.toPath()) &&
+ path.toFile().isFile() &&
+ !Utilities.isBlacklisted(input.toPath().relativize(path))
).forEach(path -> {
File file = path.toAbsolutePath().toFile();
try (FileInputStream fin = new FileInputStream(file);
- BufferedInputStream bfin = new BufferedInputStream(fin)){
+ BufferedInputStream bfin = new BufferedInputStream(fin)) {
ArchiveEntry entry = arc.createArchiveEntry(file, input.toPath().relativize(path).toString());
arc.putArchiveEntry(entry);
diff --git a/src/main/java/net/szum123321/textile_backup/core/compressors/ParallelBZip2Compressor.java b/src/main/java/net/szum123321/textile_backup/core/compressors/ParallelBZip2Compressor.java
new file mode 100644
index 0000000..bc0c3c2
--- /dev/null
+++ b/src/main/java/net/szum123321/textile_backup/core/compressors/ParallelBZip2Compressor.java
@@ -0,0 +1,62 @@
+package net.szum123321.textile_backup.core.compressors;
+
+import net.minecraft.server.command.ServerCommandSource;
+import net.szum123321.textile_backup.TextileBackup;
+import net.szum123321.textile_backup.core.Utilities;
+import org.apache.commons.compress.archivers.ArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
+import org.apache.commons.compress.utils.IOUtils;
+import org.at4j.comp.bzip2.BZip2OutputStream;
+import org.at4j.comp.bzip2.BZip2OutputStreamSettings;
+
+import java.io.*;
+import java.nio.file.Files;
+
+public class ParallelBZip2Compressor {
+ public static void createArchive(File in, File out, ServerCommandSource ctx) {
+ Utilities.log("Starting compression...", ctx);
+
+ BZip2OutputStreamSettings settings = new BZip2OutputStreamSettings().setNumberOfEncoderThreads(Runtime.getRuntime().availableProcessors());
+
+ long start = System.nanoTime();
+
+ try (FileOutputStream fileOutputStream = new FileOutputStream(out);
+ BufferedOutputStream bufferedOutputStream = new BufferedOutputStream(fileOutputStream);
+ BZip2OutputStream bZip2OutputStream = new BZip2OutputStream(bufferedOutputStream, settings);
+ TarArchiveOutputStream arc = new TarArchiveOutputStream(bZip2OutputStream)) {
+
+ arc.setLongFileMode(TarArchiveOutputStream.LONGFILE_POSIX);
+ arc.setBigNumberMode(TarArchiveOutputStream.BIGNUMBER_POSIX);
+
+ File input = in.getCanonicalFile();
+
+ Files.walk(input.toPath()
+ ).filter(path -> !path.equals(input.toPath()) &&
+ path.toFile().isFile() &&
+ !Utilities.isBlacklisted(input.toPath().relativize(path))
+ ).forEach(path -> {
+ File file = path.toAbsolutePath().toFile();
+
+ try (FileInputStream fin = new FileInputStream(file);
+ BufferedInputStream bfin = new BufferedInputStream(fin)) {
+ ArchiveEntry entry = arc.createArchiveEntry(file, input.toPath().relativize(path).toString());
+
+ arc.putArchiveEntry(entry);
+ IOUtils.copy(bfin, arc);
+
+ arc.closeArchiveEntry();
+ } catch (IOException e) {
+ TextileBackup.logger.error(e.getMessage());
+ }
+ });
+
+ arc.finish();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+ long end = System.nanoTime();
+
+ Utilities.log("Compression took: " + ((end - start) / 1000000000.0) + "s", ctx);
+ }
+}
\ No newline at end of file
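
Note: ParallelBZip2Compressor writes a standard .tar.bz2 stream, so archives it produces should remain readable with the Apache Commons Compress classes already on the classpath. A minimal read-back sketch (not part of this change; the archive path is a placeholder):

    import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
    import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
    import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;

    import java.io.BufferedInputStream;
    import java.io.FileInputStream;
    import java.io.IOException;

    public class BackupLister {
        public static void main(String[] args) throws IOException {
            // Placeholder path; point this at an archive produced by ParallelBZip2Compressor.
            try (FileInputStream fin = new FileInputStream("backup.tar.bz2");
                 BufferedInputStream bin = new BufferedInputStream(fin);
                 BZip2CompressorInputStream bzin = new BZip2CompressorInputStream(bin);
                 TarArchiveInputStream tar = new TarArchiveInputStream(bzin)) {
                TarArchiveEntry entry;
                while ((entry = tar.getNextTarEntry()) != null) {
                    System.out.println(entry.getName() + " (" + entry.getSize() + " bytes)");
                }
            }
        }
    }
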
diff --git a/src/main/java/net/szum123321/textile_backup/core/compressors/ParallelZipCompressor.java b/src/main/java/net/szum123321/textile_backup/core/compressors/ParallelZipCompressor.java
index 867c175..11a8ae5 100644
--- a/src/main/java/net/szum123321/textile_backup/core/compressors/ParallelZipCompressor.java
+++ b/src/main/java/net/szum123321/textile_backup/core/compressors/ParallelZipCompressor.java
@@ -24,7 +24,7 @@ public class ParallelZipCompressor {
public static void createArchive(File in, File out, ServerCommandSource ctx) {
Utilities.log("Starting compression...", ctx);
- long start = System.nanoTime();;
+ long start = System.nanoTime();
try (FileOutputStream fileOutputStream = new FileOutputStream(out);
BufferedOutputStream bufferedOutputStream = new BufferedOutputStream(fileOutputStream);
@@ -39,15 +39,15 @@ public class ParallelZipCompressor {
File input = in.getCanonicalFile();
- Files.walk(input.toPath()).filter(
- path -> !path.equals(input.toPath()) &&
- path.toFile().isFile() &&
- !Utilities.isBlacklisted(input.toPath().relativize(path))
+ Files.walk(input.toPath()
+ ).filter(path -> !path.equals(input.toPath()) &&
+ path.toFile().isFile() &&
+ !Utilities.isBlacklisted(input.toPath().relativize(path))
).forEach(p -> {
- ZipArchiveEntry entry = new ZipArchiveEntry(input.toPath().relativize(p).toString());
- entry.setMethod(ZipEntry.DEFLATED);
- FileInputStreamSupplier supplier = new FileInputStreamSupplier(p);
- scatterZipCreator.addArchiveEntry(entry, supplier);
+ ZipArchiveEntry entry = new ZipArchiveEntry(input.toPath().relativize(p).toString());
+ entry.setMethod(ZipEntry.DEFLATED);
+ FileInputStreamSupplier supplier = new FileInputStreamSupplier(p);
+ scatterZipCreator.addArchiveEntry(entry, supplier);
});
scatterZipCreator.writeTo(arc);
diff --git a/src/main/java/org/at4j/comp/CompressionLevel.java b/src/main/java/org/at4j/comp/CompressionLevel.java
new file mode 100644
index 0000000..2f1fed0
--- /dev/null
+++ b/src/main/java/org/at4j/comp/CompressionLevel.java
@@ -0,0 +1,43 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.at4j.comp;
+
+/**
+ * This is an enumeration over different generic compression levels supported by
+ * some of At4J's compression algorithms.
+ * @author Karl Gustafsson
+ * @since 1.0.2
+ */
+public enum CompressionLevel
+{
+ BEST("best"), DEFAULT("default"), FASTEST("fastest");
+
+ private final String m_tag;
+
+ private CompressionLevel(String tag)
+ {
+ m_tag = tag;
+ }
+
+ @Override
+ public String toString()
+ {
+ return m_tag + " compression";
+ }
+}
diff --git a/src/main/java/org/at4j/comp/bzip2/BZip2EncoderExecutorService.java b/src/main/java/org/at4j/comp/bzip2/BZip2EncoderExecutorService.java
new file mode 100644
index 0000000..8345d82
--- /dev/null
+++ b/src/main/java/org/at4j/comp/bzip2/BZip2EncoderExecutorService.java
@@ -0,0 +1,50 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.at4j.comp.bzip2;
+
+/**
+ * This interface identifies an executor service that is used to spread the
+ * encoding of bzip2 blocks over several threads. It can be used to speed up
+ * bzip2 encoding.
+ *
+ * The executor service spreads the work over all threads available to it. If a
+ * {@link BZip2OutputStream} submits more work when all threads are busy, the
+ * call blocks until the next thread becomes available.
+ *
+ * When the client is done using the executor, it must call {@link #shutdown()}
+ * to release all of its resources.
+ *
+ * An executor service instance can be had from the
+ * {@link BZip2OutputStream#createExecutorService(int)} method.
+ *
+ * This interface does not expose any methods except the {@link #shutdown()}
+ * method and there is no way of making a custom executor service
+ * implementation.
+ * @author Karl Gustafsson
+ * @since 1.1
+ */
+public interface BZip2EncoderExecutorService
+{
+ /**
+ * This method should be called when the executor service is no longer
+ * needed. It terminates all threads and releases all other resources
+ * associated with the executor.
+ */
+ void shutdown();
+}
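
Note: the Javadoc above describes the intended lifecycle: obtain a shared executor from BZip2OutputStream.createExecutorService(int), hand it to each stream via BZip2OutputStreamSettings.setExecutorService(...), and call shutdown() when done. A minimal sketch of that sharing pattern, using only the API added in this change (output paths are placeholders, not part of this change):

    import org.at4j.comp.bzip2.BZip2EncoderExecutorService;
    import org.at4j.comp.bzip2.BZip2OutputStream;
    import org.at4j.comp.bzip2.BZip2OutputStreamSettings;

    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.OutputStream;

    public class SharedExecutorExample {
        public static void main(String[] args) throws IOException {
            BZip2EncoderExecutorService executor =
                    BZip2OutputStream.createExecutorService(Runtime.getRuntime().availableProcessors());
            try {
                BZip2OutputStreamSettings settings = new BZip2OutputStreamSettings().setExecutorService(executor);
                try (OutputStream a = new BZip2OutputStream(new FileOutputStream("a.bz2"), settings);
                     OutputStream b = new BZip2OutputStream(new FileOutputStream("b.bz2"), settings)) {
                    a.write("first stream".getBytes());
                    b.write("second stream".getBytes());
                }
            } finally {
                // The creator of the executor is responsible for releasing its threads.
                executor.shutdown();
            }
        }
    }
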
diff --git a/src/main/java/org/at4j/comp/bzip2/BZip2EncoderExecutorServiceImpl.java b/src/main/java/org/at4j/comp/bzip2/BZip2EncoderExecutorServiceImpl.java
new file mode 100644
index 0000000..e41dfcf
--- /dev/null
+++ b/src/main/java/org/at4j/comp/bzip2/BZip2EncoderExecutorServiceImpl.java
@@ -0,0 +1,86 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.at4j.comp.bzip2;
+
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.RejectedExecutionHandler;
+import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * This is the only implementation of {@link BZip2EncoderExecutorService}. All
+ * objects that are using that interface assume that it is implemented by this
+ * class.
+ * @author Karl Gustafsson
+ * @since 1.1
+ */
+final class BZip2EncoderExecutorServiceImpl implements BZip2EncoderExecutorService
+{
+ /**
+ * This rejected execution handler shoehorns in a job in an
+ * {@link ExecutorService}'s job queue if it is rejected by the service.
+ * This requires that the service's job queue has an upper bound and that it
+ * blocks when trying to insert more elements than the bound.
+ * @author Karl Gustafsson
+ * @since 1.1
+ */
+ private static class ShoehornInJobRejectedExecutionHandler implements RejectedExecutionHandler
+ {
+ private static final ShoehornInJobRejectedExecutionHandler INSTANCE = new ShoehornInJobRejectedExecutionHandler();
+
+ public void rejectedExecution(Runnable r, ThreadPoolExecutor executor)
+ {
+ // System.out.print("Shoehorning... ");
+ try
+ {
+ executor.getQueue().put(r);
+ }
+ catch (InterruptedException e)
+ {
+ throw new RuntimeException(e);
+ }
+ // System.out.println("done");
+ }
+ }
+
+ private final ThreadPoolExecutor m_executor;
+ private final ErrorState m_errorState;
+
+ BZip2EncoderExecutorServiceImpl(int noThreads, ErrorState es)
+ {
+ m_executor = new ThreadPoolExecutor(noThreads, noThreads, 100, TimeUnit.SECONDS, new ArrayBlockingQueue<Runnable>(1), new EncodingThreadFactory(es), ShoehornInJobRejectedExecutionHandler.INSTANCE);
+ m_errorState = es;
+ }
+
+ ErrorState getErrorState()
+ {
+ return m_errorState;
+ }
+
+ void execute(BlockEncoderRunnable r)
+ {
+ m_executor.execute(r);
+ }
+
+ public void shutdown()
+ {
+ m_executor.shutdown();
+ }
+}
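
Note: the blocking submission promised by the BZip2EncoderExecutorService Javadoc comes from pairing a one-slot ArrayBlockingQueue with the shoehorn handler above: once every worker is busy and the queue slot is taken, ThreadPoolExecutor.execute() rejects the task and the handler blocks on put() until a worker frees up. A standalone sketch of the same back-pressure pattern with dummy tasks (illustrative only, not part of this change):

    import java.util.concurrent.ArrayBlockingQueue;
    import java.util.concurrent.RejectedExecutionHandler;
    import java.util.concurrent.ThreadPoolExecutor;
    import java.util.concurrent.TimeUnit;

    public class BlockingSubmitDemo {
        public static void main(String[] args) throws InterruptedException {
            RejectedExecutionHandler blockOnReject = (r, executor) -> {
                try {
                    executor.getQueue().put(r); // block until a queue slot opens up
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    throw new RuntimeException(e);
                }
            };
            ThreadPoolExecutor pool = new ThreadPoolExecutor(2, 2, 100, TimeUnit.SECONDS,
                    new ArrayBlockingQueue<>(1), blockOnReject);
            for (int i = 0; i < 6; i++) {
                final int jobNo = i;
                // With two workers and one queue slot, later submissions block here,
                // which is how BZip2OutputStream is throttled when all encoder threads are busy.
                pool.execute(() -> {
                    try { Thread.sleep(200); } catch (InterruptedException ignored) { }
                    System.out.println("encoded pretend block " + jobNo);
                });
            }
            pool.shutdown();
            pool.awaitTermination(10, TimeUnit.SECONDS);
        }
    }
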
diff --git a/src/main/java/org/at4j/comp/bzip2/BZip2OutputStream.java b/src/main/java/org/at4j/comp/bzip2/BZip2OutputStream.java
new file mode 100644
index 0000000..de5f5dc
--- /dev/null
+++ b/src/main/java/org/at4j/comp/bzip2/BZip2OutputStream.java
@@ -0,0 +1,306 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.at4j.comp.bzip2;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.at4j.support.io.LittleEndianBitOutputStream;
+
+/**
+ * This is an {@link OutputStream} for bzip2 compressing data.
+ *
+ * This stream is not safe for concurrent access by several writing
+ * threads. A client must provide external synchronization to use this from
+ * several threads.
+ * @author Karl Gustafsson
+ * @since 1.1
+ * @see BZip2OutputStreamSettings
+ */
+public class BZip2OutputStream extends OutputStream
+{
+ private static final byte[] EOS_MAGIC = new byte[] { 0x17, 0x72, 0x45, 0x38, 0x50, (byte) 0x90 };
+
+ // This is used to generate unique hash codes for each created stream
+ // object.
+ private static final AtomicInteger HASH_CODE_GENERATOR = new AtomicInteger(0);
+
+ private final LittleEndianBitOutputStream m_wrapped;
+ // The block size in bytes
+ private final int m_blockSize;
+
+ // Data stream that writes to the block currently being filled with data.
+ private final BlockOutputStream m_blockOutputStream;
+ // If several threads are used to encode the data, this is used to write the
+ // encoded blocks in the right order.
+ private final EncodedBlockWriter m_encodedBlockWriter;
+ private final BZip2EncoderExecutorServiceImpl m_executorService;
+ private final boolean m_iCreatedExecutor;
+ private final int m_hashCode = HASH_CODE_GENERATOR.getAndIncrement();
+
+ private boolean m_closed;
+ private long m_pos = 0;
+
+ private static void writeFileHeader(OutputStream os, int blockSize) throws IOException
+ {
+ // File header
+ os.write('B');
+ os.write('Z');
+ // File version
+ os.write('h');
+ // Block size as a character. The ASCII code for 0 is 48.
+ os.write(blockSize + 48);
+ }
+
+ /**
+ * Create a new bzip2 compressing output stream with default settings.
+ * @param wrapped Compressed data is written to this stream.
+ * @throws IOException On errors writing the file header.
+ * @see #BZip2OutputStream(OutputStream, BZip2OutputStreamSettings)
+ */
+ public BZip2OutputStream(OutputStream wrapped) throws IOException
+ {
+ this(wrapped, new BZip2OutputStreamSettings());
+ }
+
+ /**
+ * Create a new bzip2 compressing output stream.
+ * @param wrapped Compressed data is written to this stream.
+ * @param settings Compression settings.
+ * @throws IOException On errors writing the file header.
+ * @see #BZip2OutputStream(OutputStream)
+ */
+ public BZip2OutputStream(OutputStream wrapped, BZip2OutputStreamSettings settings) throws IOException
+ {
+ // Null checks
+ wrapped.getClass();
+ settings.getClass();
+
+ m_wrapped = new LittleEndianBitOutputStream(wrapped);
+ // bzip2 uses 1kb == 1000b
+ m_blockSize = settings.getBlockSize() * 100 * 1000;
+
+ writeFileHeader(wrapped, settings.getBlockSize());
+
+ EncodingScratchpad sp;
+ if (settings.getExecutorService() != null)
+ {
+ // Use the supplied executor service
+ // There is only one allowed implementation for now.
+ m_executorService = (BZip2EncoderExecutorServiceImpl) settings.getExecutorService();
+ m_iCreatedExecutor = false;
+ m_encodedBlockWriter = new EncodedBlockWriter(m_wrapped);
+ // Each encoder thread has its own scratchpad
+ sp = null;
+ }
+ else if (settings.getNumberOfEncoderThreads() > 0)
+ {
+ // Use separate encoder threads.
+ m_executorService = new BZip2EncoderExecutorServiceImpl(settings.getNumberOfEncoderThreads(), new SingleObserverErrorState());
+ m_iCreatedExecutor = true;
+ m_encodedBlockWriter = new EncodedBlockWriter(m_wrapped);
+ // Each encoder thread has its own scratchpad
+ sp = null;
+ }
+ else
+ {
+ // Encode in the thread writing to the stream.
+ m_executorService = null;
+ m_iCreatedExecutor = false;
+ sp = new EncodingScratchpad();
+ m_encodedBlockWriter = null;
+ }
+
+ m_blockOutputStream = new BlockOutputStream(m_wrapped, m_blockSize, settings.getNumberOfHuffmanTreeRefinementIterations(), m_executorService, this, m_encodedBlockWriter, sp);
+ }
+
+ private void assertNotClosed() throws IOException
+ {
+ if (m_closed)
+ {
+ throw new IOException("This stream is closed");
+ }
+ }
+
+ private void checkErrorState() throws IOException, RuntimeException
+ {
+ if (m_executorService != null)
+ {
+ m_executorService.getErrorState().checkAndClearErrors(this);
+ }
+ }
+
+ private void debug(String msg)
+ {
+
+ }
+
+ private void writeEosBlock() throws IOException
+ {
+ // Write the end of stream magic
+ for (int i = 0; i < EOS_MAGIC.length; i++)
+ {
+ m_wrapped.writeBitsLittleEndian(EOS_MAGIC[i] & 0xFF, 8);
+ }
+ // Write file checksum
+ m_wrapped.writeBitsLittleEndian(m_blockOutputStream.getFileChecksum(), 32);
+ m_wrapped.padToByteBoundary();
+ }
+
+ @Override
+ public void write(int b) throws IOException
+ {
+ assertNotClosed();
+ checkErrorState();
+
+ m_pos++;
+ m_blockOutputStream.write(b & 0xFF);
+ }
+
+ @Override
+ public void write(byte[] data) throws IOException
+ {
+ assertNotClosed();
+ checkErrorState();
+
+ m_pos += data.length;
+ m_blockOutputStream.write(data);
+ }
+
+ @Override
+ public void write(byte[] data, int offset, int len) throws IOException, IndexOutOfBoundsException
+ {
+ assertNotClosed();
+ checkErrorState();
+
+ if (offset < 0)
+ {
+ throw new IndexOutOfBoundsException("Offset: " + offset);
+ }
+ if (len < 0)
+ {
+ throw new IndexOutOfBoundsException("Length: " + len);
+ }
+ if (offset + len > data.length)
+ {
+ throw new IndexOutOfBoundsException("Offset: " + offset + " + Length: " + len + " > length of data: " + data.length);
+ }
+
+ m_pos += len;
+ m_blockOutputStream.write(data, offset, len);
+ }
+
+ @Override
+ public void close() throws IOException
+ {
+ checkErrorState();
+
+ if (!m_closed)
+ {
+ // This writes out any remaining run length encoding data and closes
+ // the block output stream.
+ m_blockOutputStream.close();
+
+ if ((m_pos > 0) && (m_encodedBlockWriter != null))
+ {
+ // Wait for all blocks to be written.
+ try
+ {
+ m_encodedBlockWriter.waitFor();
+ }
+ catch (InterruptedException e)
+ {
+ // Repackage
+ throw new IOException("Interrupted. The output file is most likely corrupted.");
+ }
+ checkErrorState();
+ }
+
+ writeEosBlock();
+
+ m_wrapped.close();
+
+ debug("Original size: " + m_pos + ", compressed size: " + m_wrapped.getNumberOfBytesWritten());
+
+ if (m_iCreatedExecutor && (m_executorService != null))
+ {
+ m_executorService.shutdown();
+ }
+ m_closed = true;
+ super.close();
+ }
+ }
+
+ @Override
+ public int hashCode()
+ {
+ return m_hashCode;
+ }
+
+ @Override
+ public boolean equals(Object o)
+ {
+ return this == o;
+ }
+
+ /**
+ * Close the stream if the client has been sloppy about it.
+ */
+ @Override
+ protected void finalize() throws Throwable
+ {
+ close();
+ super.finalize();
+ }
+
+ /**
+ * Create a {@link BZip2EncoderExecutorService} that can be shared between
+ * several {@link BZip2OutputStream}:s to spread the bzip2 encoding work
+ * over several threads. The created executor service can be passed to the
+ * {@link BZip2OutputStream} constructor in a
+ * {@link BZip2OutputStreamSettings} object.
+ * @param noThreads The number of threads available to the executor.
+ * @return The executor service.
+ */
+ public static BZip2EncoderExecutorService createExecutorService(int noThreads)
+ {
+ return new BZip2EncoderExecutorServiceImpl(noThreads, new MultipleObserverErrorState());
+ }
+
+ /**
+ * Create a {@link BZip2EncoderExecutorService} that can be shared between
+ * several {@link BZip2OutputStream}:s to spread the bzip2 encoding work
+ * over several threads. The created executor service can be passed to the
+ * {@link BZip2OutputStream} constructor in a
+ * {@link BZip2OutputStreamSettings} object.
+ *
+ * The created executor will have as many threads available to it as there
+ * are CPU:s available to the JVM.
+ * @return The executor service.
+ */
+ public static BZip2EncoderExecutorService createExecutorService()
+ {
+ return createExecutorService(Runtime.getRuntime().availableProcessors());
+ }
+}
diff --git a/src/main/java/org/at4j/comp/bzip2/BZip2OutputStreamSettings.java b/src/main/java/org/at4j/comp/bzip2/BZip2OutputStreamSettings.java
new file mode 100644
index 0000000..fd35e27
--- /dev/null
+++ b/src/main/java/org/at4j/comp/bzip2/BZip2OutputStreamSettings.java
@@ -0,0 +1,223 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.at4j.comp.bzip2;
+
+import org.at4j.support.lang.At4JException;
+
+/**
+ * This object contains settings for the {@link BZip2OutputStream}.
+ *
+ * When created, this object contains the default settings. Modify the settings
+ * by calling setter methods on this object.
+ * @author Karl Gustafsson
+ * @since 1.1
+ * @see BZip2OutputStream
+ */
+public class BZip2OutputStreamSettings implements Cloneable
+{
+ /**
+ * The minimum size of an encoded data block in hundreds of kilobytes. Using
+ * a small block size gives faster but worse compression.
+ */
+ public static final int MIN_BLOCK_SIZE = 1;
+
+ /**
+ * The maximum size of an encoded data block in hundreds of kilobytes. Using
+ * a large block size gives slower but better compression.
+ */
+ public static final int MAX_BLOCK_SIZE = 9;
+
+ /**
+ * The default block size.
+ */
+ public static final int DEFAULT_BLOCK_SIZE = MAX_BLOCK_SIZE;
+
+ /**
+ * The default number of Huffman tree refinement iterations. By having more
+ * tree refinement iterations the compression gets better, but as the number
+ * is increased the returns are diminishing.
+ */
+ public static final int DEFAULT_NO_OF_HUFFMAN_TREE_REFINEMENT_ITERATIONS = 5;
+
+ /**
+ * The default number of encoder threads.
+ */
+ public static final int DEFAULT_NO_OF_ENCODER_THREADS = 0;
+
+ private int m_blockSize = DEFAULT_BLOCK_SIZE;
+ private int m_numberOfHuffmanTreeRefinementIterations = DEFAULT_NO_OF_HUFFMAN_TREE_REFINEMENT_ITERATIONS;
+ private int m_numberOfEncoderThreads = DEFAULT_NO_OF_ENCODER_THREADS;
+ private BZip2EncoderExecutorService m_executorService;
+
+ /**
+ * Set the size of compressed data blocks. A high block size gives good but
+ * slow compression. A low block size gives worse but faster compression.
+ *
+ * The default block size is 9 (the highest permitted value).
+ * @param bs The block size in hundreds of kilobytes. This should be between
+ * 1 and 9 (inclusive).
+ * @return {@code this}
+ * @throws IllegalArgumentException If the block size is not in the
+ * permitted range.
+ */
+ public BZip2OutputStreamSettings setBlockSize(int bs) throws IllegalArgumentException
+ {
+ if (bs < MIN_BLOCK_SIZE || bs > MAX_BLOCK_SIZE)
+ {
+ throw new IllegalArgumentException("Invalid block size " + bs + ". It must be between " + MIN_BLOCK_SIZE + " and " + MAX_BLOCK_SIZE + " (inclusive)");
+ }
+ m_blockSize = bs;
+ return this;
+ }
+
+ /**
+ * Get the block size for a compressed data block.
+ * @return The block size for a compressed data block.
+ */
+ public int getBlockSize()
+ {
+ return m_blockSize;
+ }
+
+ /**
+ * Set the number of tree refinement iterations that are run when creating
+ * Huffman trees for each compressed data block.
+ *
+ * A higher value for this parameter should give better but slower
+ * compression. As the value increases the returns are diminishing.
+ *
+ * The default value is five refinement iterations.
+ * @param no The number of Huffman tree refinement iterations. This should
+ * be a positive integer larger than zero.
+ * @return {@code this}
+ * @throws IllegalArgumentException If the number is not a positive integer
+ * larger than zero.
+ */
+ public BZip2OutputStreamSettings setNumberOfHuffmanTreeRefinementIterations(int no) throws IllegalArgumentException
+ {
+ if (no < 1)
+ {
+ throw new IllegalArgumentException("Invalid value " + no + ". It must be greater than zero");
+ }
+ m_numberOfHuffmanTreeRefinementIterations = no;
+ return this;
+ }
+
+ /**
+ * Get the number of Huffman tree refinement iterations.
+ * @return The number of Huffman tree refinement iterations.
+ */
+ public int getNumberOfHuffmanTreeRefinementIterations()
+ {
+ return m_numberOfHuffmanTreeRefinementIterations;
+ }
+
+ /**
+ * Set a log adapter for logging diagnostic output to. Output is
+ * logged to the debug and trace levels.
+ *
+ * By default no log adapter is used and hence no diagnostic output is
+ * logged.
+ * @param la A log adapter.
+ * @return {@code this}
+ */
+ public BZip2OutputStreamSettings setLogAdapter(Object la)
+ {
+ return this;
+ }
+
+
+ /**
+ * Set the number of encoder threads used for bzip2 compressing data. bzip2
+ * encoding is CPU intensive and giving the encoder more threads to work
+ * with can drastically shorten the encoding time. The drawback is that the
+ * memory consumption grows since each encoder thread must keep its data in
+ * memory.
+ *
+ * The default number of encoder threads is zero, which means that the
+ * thread that is writing the data to the {@link BZip2OutputStream} will be
+ * used for the encoding.
+ *
+ * For the shortest encoding time, use as many threads as there are
+ * available CPU:s in the system.
+ * @param no The number of encoder threads to use. If this is set to {@code
+ * 0}, the encoding will be done in the thread writing to the stream.
+ * @return {@code this}
+ * @throws IllegalArgumentException If {@code no} is negative.
+ * @see #setExecutorService(BZip2EncoderExecutorService)
+ */
+ public BZip2OutputStreamSettings setNumberOfEncoderThreads(int no) throws IllegalArgumentException
+ {
+ if (no < 0)
+ {
+ throw new IllegalArgumentException("Invalid number of encoder threads " + no + ". The number must be zero or greater");
+ }
+
+ m_numberOfEncoderThreads = no;
+ return this;
+ }
+
+ public int getNumberOfEncoderThreads()
+ {
+ return m_numberOfEncoderThreads;
+ }
+
+ /**
+ * Set an executor service that the {@link BZip2OutputStream} will use to
+ * spread the encoding over several threads. This executor can be shared
+ * among several {@link BZip2OutputStream} objects.
+ *
+ * If an executor service is set using this method, all threads that are
+ * available to the executor is used for the encoding and any value set
+ * using {@link #setNumberOfEncoderThreads(int)} is ignored.
+ *
+ * An executor service is created using the
+ * {@link BZip2OutputStream#createExecutorService()} or the
+ * {@link BZip2OutputStream#createExecutorService(int)} method.
+ * @param executorService The executor service.
+ * @return {@code this}
+ * @see #setNumberOfEncoderThreads(int)
+ */
+ public BZip2OutputStreamSettings setExecutorService(BZip2EncoderExecutorService executorService)
+ {
+ m_executorService = executorService;
+ return this;
+ }
+
+ public BZip2EncoderExecutorService getExecutorService()
+ {
+ return m_executorService;
+ }
+
+ /**
+ * Make a copy of this object.
+ */
+ @Override
+ public BZip2OutputStreamSettings clone()
+ {
+ try
+ {
+ return (BZip2OutputStreamSettings) super.clone();
+ }
+ catch (CloneNotSupportedException e)
+ {
+ throw new At4JException("Bug", e);
+ }
+ }
+}
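
Note: the settings object exposes four knobs: block size (1-9, in hundreds of kilobytes), Huffman tree refinement iterations, encoder thread count, and an optional shared executor. ParallelBZip2Compressor only sets the thread count; a hypothetical speed-oriented preset (values are illustrative, not the mod's defaults and not part of this change) could combine the others like this:

    import org.at4j.comp.bzip2.BZip2OutputStreamSettings;

    final class BackupCompressionPresets {
        // Hypothetical helper: trades compression ratio for speed.
        static BZip2OutputStreamSettings fast() {
            return new BZip2OutputStreamSettings()
                    .setBlockSize(1)                               // 100 kB blocks: fastest, worst ratio
                    .setNumberOfHuffmanTreeRefinementIterations(1) // minimum permitted value
                    .setNumberOfEncoderThreads(Runtime.getRuntime().availableProcessors());
        }
    }
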
diff --git a/src/main/java/org/at4j/comp/bzip2/Block.java b/src/main/java/org/at4j/comp/bzip2/Block.java
new file mode 100644
index 0000000..928cd7a
--- /dev/null
+++ b/src/main/java/org/at4j/comp/bzip2/Block.java
@@ -0,0 +1,29 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.at4j.comp.bzip2;
+
+/**
+ * Interface identifying a bzip2 data block. Used by the {@link BlockDecoder}.
+ * @author Karl Gustafsson
+ * @since 1.1
+ */
+interface Block
+{
+ // Nothing
+}
diff --git a/src/main/java/org/at4j/comp/bzip2/BlockDecoder.java b/src/main/java/org/at4j/comp/bzip2/BlockDecoder.java
new file mode 100644
index 0000000..adb741a
--- /dev/null
+++ b/src/main/java/org/at4j/comp/bzip2/BlockDecoder.java
@@ -0,0 +1,422 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.at4j.comp.bzip2;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.at4j.support.comp.ByteMoveToFront;
+import org.at4j.support.comp.IntMoveToFront;
+import org.at4j.support.io.LittleEndianBitInputStream;
+import org.at4j.support.lang.At4JException;
+import org.at4j.support.lang.UnsignedInteger;
+
+/**
+ * This is used by the {@link BZip2InputStream} to decode data blocks.
+ * @author Karl Gustafsson
+ * @since 1.1
+ */
+final class BlockDecoder
+{
+ // The magic number identifying a block of compressed data
+ private static final byte[] COMPRESSED_BLOCK_MAGIC = new byte[] { (byte) 0x31, (byte) 0x41, (byte) 0x59, (byte) 0x26, (byte) 0x53, (byte) 0x59 };
+ // The magic number identifying the end of stream block
+ private static final byte[] EOS_BLOCK_MAGIC = new byte[] { (byte) 0x17, (byte) 0x72, (byte) 0x45, (byte) 0x38, (byte) 0x50, (byte) 0x90 };
+
+ // The number of symbols to read from each Huffman tree before switching
+ private static final int SYMBOLS_TO_READ_FROM_EACH_TREE = 50;
+
+ // The symbol value of the special RUNA symbol.
+ private static final int RUNA_SYMBOL = 0;
+ // The symbol value of the special RUNB symbol.
+ private static final int RUNB_SYMBOL = 1;
+
+ private static final int MAX_NO_OF_MTF_SYMBOLS = 258;
+
+ private static final byte[] INITIAL_MOVE_TO_FRONT_ALPHABET = new byte[MAX_NO_OF_MTF_SYMBOLS];
+ static
+ {
+ for (int i = 0; i < MAX_NO_OF_MTF_SYMBOLS; i++)
+ {
+ INITIAL_MOVE_TO_FRONT_ALPHABET[i] = (byte) i;
+ }
+ }
+
+ private final LittleEndianBitInputStream m_in;
+ private final int m_blockSize;
+
+ // Data read from the block header
+
+ // Block checksum (CRC)
+ private int m_readBlockChecksum;
+ // The pointer to the original data used in the BW transform
+ private int m_originalDataPointer;
+ // The Huffman trees used for decompression
+ private HighValueBranchHuffmanTree[] m_huffmanTrees;
+ // The EOB (End Of Block) symbol index.
+ private int m_endOfBlockSymbol;
+ // The number of times that the Huffman trees are switched in the input.
+ // The trees are switched every 50 bytes.
+ private int m_numberOfTimesHuffmanTreesAreSwitched;
+ private int[] m_treeUse;
+ // Mapping between symbol values and byte values.
+ private byte[] m_symbolSequenceNos;
+ // Frequency of each byte in the pre-BW data
+ private int[] m_byteFrequencies;
+
+ // State variables
+
+ // The number of the currently selected Huffman tree
+ private HighValueBranchHuffmanTree m_curTree;
+ // The number of symbols left to read from the current Huffman tree
+ private int m_symbolsLeftToReadFromCurTree;
+ // The current number of Huffman tree switches
+ private int m_switchNo;
+ // A counter for the number of bytes decoded in this block.
+ private int m_noBytesDecoded;
+ private ByteMoveToFront m_mtfTransformer;
+ // This will hold the decoded data (before the Burrows Wheeler decoding)
+ private final byte[] m_decoded;
+
+ BlockDecoder(LittleEndianBitInputStream in, int blockSize)
+ {
+ m_in = in;
+ m_blockSize = blockSize;
+ m_decoded = new byte[blockSize];
+ }
+
+ private void throwIOException(String msg) throws IOException
+ {
+ throw new IOException(msg + ". Position in input stream: " + m_in.getNumberOfBytesRead());
+ }
+
+ private void checkInterrupted() throws InterruptedException
+ {
+ if (Thread.interrupted())
+ {
+ throw new InterruptedException();
+ }
+ }
+
+ private void trace(String s)
+ {
+ System.out.println(s);
+ }
+
+ static HighValueBranchHuffmanTree decodeHuffmanTree(final int totalNumberOfSymbols, final LittleEndianBitInputStream in) throws IOException
+ {
+ int[] symbolLengths = new int[totalNumberOfSymbols];
+
+ // Starting bit length for Huffman deltas in this tree
+ int currentBitLength = in.readBits(5);
+ if (currentBitLength > 20)
+ {
+ throw new IOException("Invalid starting bit length for Huffman deltas: " + currentBitLength + ". Must be <= 20");
+ }
+
+ // Initialize min and max lengths per tree with values that
+ // will certainly be overwritten.
+ int minBitLengthPerTree = 20;
+ int maxBitLengthPerTree = 0;
+
+ for (int j = 0; j < totalNumberOfSymbols; j++)
+ {
+ while (in.readBit())
+ {
+ currentBitLength += in.readBit() ? -1 : 1;
+ if ((currentBitLength < 1) || (currentBitLength > 20))
+ {
+ throw new IOException("Invalid bit length " + currentBitLength);
+ }
+ }
+ symbolLengths[j] = currentBitLength;
+
+ if (currentBitLength < minBitLengthPerTree)
+ {
+ minBitLengthPerTree = currentBitLength;
+ }
+ if (currentBitLength > maxBitLengthPerTree)
+ {
+ maxBitLengthPerTree = currentBitLength;
+ }
+ }
+ return new HighValueBranchHuffmanTree(symbolLengths, minBitLengthPerTree, maxBitLengthPerTree, false);
+ }
+
+ private void readCompressedBlockHeader() throws IOException
+ {
+ byte[] barr = new byte[4];
+
+ // Block checksum
+ m_readBlockChecksum = (int) UnsignedInteger.fromLittleEndianByteArrayToLong(m_in.readBytes(barr, 0, 4), 0);
+
+ // Randomized block?
+ if (m_in.readBit())
+ {
+ throwIOException("Randomized block mode is not supported");
+ }
+
+ // Starting pointer into BWT
+ m_in.readBytes(barr, 1, 3);
+ barr[0] = 0;
+ m_originalDataPointer = (int) UnsignedInteger.fromLittleEndianByteArrayToLong(barr, 0);
+ if (m_originalDataPointer > m_blockSize)
+ {
+ throw new IOException("Invalid starting pointer " + m_originalDataPointer + ". It must be less than the block size " + m_blockSize);
+ }
+
+ // Huffman used codes
+ boolean[] usedSymbols = new boolean[256];
+ int numberOfUsedSymbols = 0;
+
+ boolean[] inUseBlocks = new boolean[16];
+ for (int i = 0; i < 16; i++)
+ {
+ inUseBlocks[i] = m_in.readBit();
+ }
+ for (int i = 0; i < 16; i++)
+ {
+ if (inUseBlocks[i])
+ {
+ for (int j = 0; j < 16; j++)
+ {
+ if (m_in.readBit())
+ {
+ usedSymbols[i * 16 + j] = true;
+ numberOfUsedSymbols++;
+ }
+ }
+ }
+ }
+ if (numberOfUsedSymbols == 0)
+ {
+ throwIOException("No symbols used in table");
+ }
+
+ // Create a mapping for the sequence numbers of all used bytes
+ m_symbolSequenceNos = new byte[numberOfUsedSymbols];
+ int useIndex = 0;
+ for (int i = 0; i < 256; i++)
+ {
+ if (usedSymbols[i])
+ {
+ m_symbolSequenceNos[useIndex++] = (byte) (i & 0xFF);
+ }
+ }
+ assert useIndex == numberOfUsedSymbols;
+
+ m_byteFrequencies = new int[256];
+
+ // The number of Huffman trees to use
+ int numberOfHuffmanTrees = m_in.readBits(3);
+ if (numberOfHuffmanTrees < 2 || numberOfHuffmanTrees > 6)
+ {
+ throwIOException("Invalid number of Huffman trees " + numberOfHuffmanTrees + ". Must be between 2 and 6 (inclusive)");
+ }
+
+ // The number of times the trees to use are swapped in the input.
+ // The trees are swapped each 50 bytes.
+ m_numberOfTimesHuffmanTreesAreSwitched = m_in.readBitsLittleEndian(15);
+ if (m_numberOfTimesHuffmanTreesAreSwitched < 1)
+ {
+ throwIOException("Invalid number of times the Huffman trees are switched in the input: " + m_numberOfTimesHuffmanTreesAreSwitched);
+ }
+
+ // Zero-terminated bit runs for each tree switch
+ int[] treeUseMtf = new int[m_numberOfTimesHuffmanTreesAreSwitched];
+ for (int i = 0; i < m_numberOfTimesHuffmanTreesAreSwitched; i++)
+ {
+ treeUseMtf[i] = 0;
+ while (m_in.readBit())
+ {
+ treeUseMtf[i]++;
+ }
+ if (treeUseMtf[i] > numberOfHuffmanTrees)
+ {
+ throwIOException("Invalid Huffman tree use MTF " + treeUseMtf[i] + ". Must be less than the number of Huffman trees, " + numberOfHuffmanTrees);
+ }
+ }
+
+ // Decode the tree use MTF values
+ m_treeUse = new int[m_numberOfTimesHuffmanTreesAreSwitched];
+ // The "alphabet" for the MTF encoding -- the indices of the different
+ // tree uses.
+ int[] treeUseIndices = new int[numberOfHuffmanTrees];
+ for (int i = 0; i < numberOfHuffmanTrees; i++)
+ {
+ treeUseIndices[i] = i;
+ }
+ new IntMoveToFront(treeUseIndices).decode(treeUseMtf, m_treeUse);
+
+ // Settings for the Huffman trees
+
+ // The total number of used symbols is the value we calculated above - 1
+ // + RUNA, RUNB and an end of stream marker.
+ int totalNumberOfSymbols = numberOfUsedSymbols + 2;
+ m_huffmanTrees = new HighValueBranchHuffmanTree[numberOfHuffmanTrees];
+ for (int i = 0; i < numberOfHuffmanTrees; i++)
+ {
+ m_huffmanTrees[i] = decodeHuffmanTree(totalNumberOfSymbols, m_in);
+ }
+
+ // The symbol value for the end of the data block.
+ m_endOfBlockSymbol = totalNumberOfSymbols - 1;
+ }
+
+ private void selectNewHuffmanTree() throws IOException
+ {
+ if (m_switchNo >= m_numberOfTimesHuffmanTreesAreSwitched)
+ {
+ throwIOException("One Huffman tree switch too many: " + m_switchNo);
+ }
+ m_symbolsLeftToReadFromCurTree = SYMBOLS_TO_READ_FROM_EACH_TREE;
+ m_curTree = m_huffmanTrees[m_treeUse[m_switchNo]];
+ m_switchNo++;
+ }
+
+ private int readSymbol() throws IOException
+ {
+ if (m_symbolsLeftToReadFromCurTree == 0)
+ {
+ selectNewHuffmanTree();
+ }
+ final int symbol = m_curTree.readNext(m_in);
+ m_symbolsLeftToReadFromCurTree--;
+ return symbol;
+ }
+
+ private void decodeSingleByte(final int symbolMtf) throws IOException
+ {
+ // Move To Front decode the symbol
+ final int byteIndex = m_mtfTransformer.decode(symbolMtf - 1) & 0xFF;
+
+ final byte value = m_symbolSequenceNos[byteIndex];
+ m_decoded[m_noBytesDecoded++] = value;
+ m_byteFrequencies[value & 0xFF]++;
+ }
+
+ // returns the next symbol
+ private int handleRunaAndRunb(int symbol) throws IOException
+ {
+ int n = 1;
+ int multiplier = 0;
+ while (symbol == RUNA_SYMBOL || symbol == RUNB_SYMBOL)
+ {
+ if (symbol == RUNA_SYMBOL)
+ {
+ multiplier += n;
+ }
+ else
+ {
+ multiplier += 2 * n;
+ }
+ // Multiply n with 2
+ n <<= 1;
+ symbol = readSymbol();
+ }
+
+ // The repeated value is at the front of the MTF list
+ final int byteIndex = m_mtfTransformer.decode(0) & 0xFF;
+ final byte value = m_symbolSequenceNos[byteIndex];
+ if (multiplier == 1)
+ {
+ m_decoded[m_noBytesDecoded++] = value;
+ m_byteFrequencies[value & 0xFF]++;
+ }
+ else
+ {
+ Arrays.fill(m_decoded, m_noBytesDecoded, m_noBytesDecoded + multiplier, value);
+ m_noBytesDecoded += multiplier;
+ m_byteFrequencies[value & 0xFF] += multiplier;
+ }
+ return symbol;
+ }
+
+ CompressedDataBlock readCompressedDataBlock() throws IOException, InterruptedException
+ {
+ readCompressedBlockHeader();
+
+ int symbol = readSymbol();
+
+ while (true)
+ {
+ checkInterrupted();
+
+ if (symbol == RUNA_SYMBOL || symbol == RUNB_SYMBOL)
+ {
+ symbol = handleRunaAndRunb(symbol);
+ }
+ else if (symbol == m_endOfBlockSymbol)
+ {
+ BurrowsWheelerDecoder bwd = new BurrowsWheelerDecoder(m_decoded, m_noBytesDecoded, m_byteFrequencies, m_originalDataPointer);
+ return new CompressedDataBlock(new RLEDecodingInputStream(bwd.decode(), m_readBlockChecksum), m_readBlockChecksum);
+ }
+ else
+ {
+ decodeSingleByte(symbol);
+ symbol = readSymbol();
+ }
+ }
+ }
+
+ private void initDecoderState()
+ {
+ // Initialize the MTF alphabet
+ final byte[] moveToFrontAlphabet = new byte[MAX_NO_OF_MTF_SYMBOLS];
+ System.arraycopy(INITIAL_MOVE_TO_FRONT_ALPHABET, 0, moveToFrontAlphabet, 0, MAX_NO_OF_MTF_SYMBOLS);
+ m_mtfTransformer = new ByteMoveToFront(moveToFrontAlphabet);
+ m_curTree = null;
+ m_symbolsLeftToReadFromCurTree = 0;
+ m_switchNo = 0;
+ m_noBytesDecoded = 0;
+ }
+
+ Block getNextBlock() throws IOException
+ {
+ initDecoderState();
+
+ byte[] barr = new byte[6];
+ m_in.readBytes(barr, 0, 6);
+ if (Arrays.equals(COMPRESSED_BLOCK_MAGIC, barr))
+ {
+ trace("Found block of compressed data");
+ try
+ {
+ return readCompressedDataBlock();
+ }
+ catch (InterruptedException e)
+ {
+ throw new At4JException(e);
+ }
+ }
+ else if (Arrays.equals(EOS_BLOCK_MAGIC, barr))
+ {
+ trace("Found end of stream block");
+ m_in.readBytes(barr, 0, 4);
+ int readCrc32 = (int) UnsignedInteger.fromLittleEndianByteArrayToLong(barr, 0);
+ return new EosBlock(readCrc32);
+ }
+ else
+ {
+ throwIOException("Invalid block header " + Arrays.toString(barr) + ". Expected compressed data block or end of stream block");
+ // Never reached
+ return null;
+ }
+ }
+}
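
Note: handleRunaAndRunb above decodes run lengths written in bijective base 2: n starts at 1 and doubles each step, with RUNA contributing n and RUNB contributing 2n. The matching encoder, BlockEncoder.addRunaAndRunb, appears further down in this change. As a worked example, a run of five identical MTF symbols is encoded as RUNA, RUNB and decodes back to 1*1 + 2*2 = 5 repetitions; a package-local check (hypothetical, not part of this change) would look like:

    package org.at4j.comp.bzip2;

    // Hypothetical sanity check: encodes a run of 5 and prints the resulting RUNA/RUNB symbols.
    final class RunLengthSymbolCheck {
        public static void main(String[] args) {
            int[] symbols = new int[4];
            int written = BlockEncoder.addRunaAndRunb(symbols, 0, 5);
            System.out.println(written);    // 2 symbols written
            System.out.println(symbols[0]); // 0 == RUNA
            System.out.println(symbols[1]); // 1 == RUNB
        }
    }
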
diff --git a/src/main/java/org/at4j/comp/bzip2/BlockEncodedCallback.java b/src/main/java/org/at4j/comp/bzip2/BlockEncodedCallback.java
new file mode 100644
index 0000000..c1db007
--- /dev/null
+++ b/src/main/java/org/at4j/comp/bzip2/BlockEncodedCallback.java
@@ -0,0 +1,54 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.at4j.comp.bzip2;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import org.at4j.support.io.BitOutput;
+
+/**
+ * This callback is called by the {@link BlockEncoder} when it has encoded its
+ * block.
+ * @author Karl Gustafsson
+ * @since 1.1
+ */
+final class BlockEncodedCallback
+{
+ private final int m_blockNo;
+ private final EncodedBlockWriter m_writer;
+ private final ByteArrayOutputStream m_byteOut;
+ private final BitOutput m_bitOut;
+
+ BlockEncodedCallback(final int blockNo, final ByteArrayOutputStream byteOut, final BitOutput bitOut, final EncodedBlockWriter writer)
+ {
+ m_blockNo = blockNo;
+ m_writer = writer;
+ m_byteOut = byteOut;
+ m_bitOut = bitOut;
+ }
+
+ /**
+ * This is called by the {@link BlockEncoder} when it is done.
+ */
+ void reportBlockDone() throws IOException
+ {
+ m_writer.writeBlock(m_blockNo, new EncodedBlockData(m_byteOut.toByteArray(), m_bitOut.getNumberOfBitsInUnfinishedByte(), m_bitOut.getUnfinishedByte()));
+ }
+}
diff --git a/src/main/java/org/at4j/comp/bzip2/BlockEncoder.java b/src/main/java/org/at4j/comp/bzip2/BlockEncoder.java
new file mode 100644
index 0000000..e7b04ce
--- /dev/null
+++ b/src/main/java/org/at4j/comp/bzip2/BlockEncoder.java
@@ -0,0 +1,893 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.at4j.comp.bzip2;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.at4j.comp.bzip2.BurrowsWheelerEncoder.BurrowsWheelerEncodingResult;
+import org.at4j.support.comp.IntMoveToFront;
+import org.at4j.support.io.BitOutput;
+
+/**
+ * This is used by the thread encoding a bzip2 block.
+ * @author Karl Gustafsson
+ * @since 1.1
+ */
+final class BlockEncoder
+{
+ private static final byte[] BLOCK_MAGIC = new byte[] { 0x31, 0x41, 0x59, 0x26, 0x53, 0x59 };
+
+ // The maximum Huffman tree depth
+ private static final int MAX_HUFFMAN_BIT_LENGTH = 17;
+
+ // The values of the RUNA and RUNB symbols
+ private static final int RUNA_SYMBOL = 0;
+ private static final int RUNB_SYMBOL = 1;
+
+ private static final int MIN_NO_OF_HUFFMAN_TREES = 2;
+ static final int MAX_NO_OF_HUFFMAN_TREES = 6;
+
+ // The maximum number of different MTF symbols: 256 bytes + RUNA + RUNB +
+ // EOB - one byte (the first symbol does not have to be encoded thanks to
+ // MTF and RLE)
+ static final int MAX_NO_OF_MTF_SYMBOLS = 258;
+
+ // Write 50 symbols, then swap Huffman trees.
+ static final int NO_OF_SYMBOLS_PER_SEGMENT = 50;
+
+ // Categories used when optimizing Huffman trees
+ // For each tree length, in which category does a segment belong depending
+ // on its encoded length percentage?
+ static final int[][] CATEGORY_PER_NO_OF_TREES_AND_PERCENTAGE = new int[][] {
+ // Two trees: cutoff at 30%
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ // Three trees: cutoff at 18% and 45%
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ // Four trees: cutoff at 15%, 30% and 55%
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
+ // Five trees: cutoff at 12%, 25%, 40% and 60%
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 },
+ // Six trees: cutoff at 8%, 25%, 36%, 51% and 63%
+ { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 } };
+
+ private static final byte[] INITIAL_MTF_ALPHABET = new byte[MAX_NO_OF_MTF_SYMBOLS];
+ static
+ {
+ for (int i = 0; i < INITIAL_MTF_ALPHABET.length; i++)
+ {
+ INITIAL_MTF_ALPHABET[i] = (byte) (i & 0xFF);
+ }
+ }
+
+ private final byte[] m_block;
+ private final int m_blockNo;
+ private final int m_blockSize;
+ private final int m_blockChecksum;
+ // Bit flags indicating which bytes that occur at least once in this block
+ private final boolean[] m_seenDifferentBytes;
+ // The number of different bytes seen in this block
+ private final int m_numberOfSeenDifferentBytes;
+ private final int m_numberOfHuffmanTreeRefinementIterations;
+ // Sink to write encoded data to.
+ private final BitOutput m_out;
+ // This callback is called when the block encoder is done. It may be null.
+ private final BlockEncodedCallback m_blockEncoderCallback;
+
+ // This is set by the encoding thread before calling encode
+ private EncodingScratchpad m_scratchpad;
+
+ BlockEncoder(final byte[] block, final int blockNo, final int blockSize, final int blockChecksum, final boolean[] seenDifferentBytes, final int numberOfSeenDifferentBytes, final int numberOfHuffmanTreeRefinementIterations,
+ final BitOutput out, final BlockEncodedCallback bec)
+ {
+ m_block = block;
+ m_blockNo = blockNo;
+ m_blockSize = blockSize;
+ m_blockChecksum = blockChecksum;
+ m_seenDifferentBytes = seenDifferentBytes;
+ m_numberOfSeenDifferentBytes = numberOfSeenDifferentBytes;
+ m_numberOfHuffmanTreeRefinementIterations = numberOfHuffmanTreeRefinementIterations;
+ m_out = out;
+ m_blockEncoderCallback = bec;
+ }
+
+ void setScratchpad(EncodingScratchpad sp)
+ {
+ m_scratchpad = sp;
+ }
+
+ /**
+ * Get the seen byte values in the current block.
+ */
+ private byte[] getSeenByteValues()
+ {
+ byte[] res = new byte[m_numberOfSeenDifferentBytes];
+ int j = 0;
+ for (int i = 0; i < 256; i++)
+ {
+ if (m_seenDifferentBytes[i])
+ {
+ res[j++] = (byte) (i & 0xFF);
+ }
+ }
+ assert j == m_numberOfSeenDifferentBytes;
+ return res;
+ }
+
+ /**
+ * Add RUNA and RUNB symbols to {@code res} at {@code outIndex} to represent
+ * {@code no} repetitions of the previous symbol.
+ *
+ * This method is declared package-protected for the unit tests.
+ * @return The number of symbols added. outIndex should be incremented by
+ * this value by the caller.
+ */
+ static int addRunaAndRunb(int[] res, int outIndex, int no)
+ {
+ int noWritten = 0;
+ while (no > 0)
+ {
+ switch (no % 2)
+ {
+ case 1:
+ res[outIndex + noWritten++] = RUNA_SYMBOL;
+ no -= 1;
+ break;
+ case 0:
+ res[outIndex + noWritten++] = RUNB_SYMBOL;
+ no -= 2;
+ break;
+ default:
+ // Should not occur unless we use relativistic arithmetic or
+ // something...
+ throw new RuntimeException();
+ }
+ no >>>= 1;
+ }
+ return noWritten;
+ }
+
+ /**
+ * Create a mapping between symbols and their index numbers in the array of
+ * symbols.
+ * @param symbols The symbols.
+ * @return An array containing the index number for each symbol that occurs
+ * in {@code symbols}.
+ */
+ private byte[] createSequenceMap(byte[] symbols)
+ {
+ byte[] res = m_scratchpad.m_sequenceMap;
+ byte index = 0;
+ for (int i = 0; i < symbols.length; i++)
+ {
+ res[symbols[i] & 0xFF] = index++;
+ }
+ return res;
+ }
+
+ private static class MTFAndRLEResult
+ {
+ // The encoded data as MTF symbols.
+ private final int[] m_encodedData;
+ private final int m_dataLen;
+ private final int m_noSeenDifferentSymbols;
+
+ private MTFAndRLEResult(int[] symbols, int dataLen, int noSeenDifferentSymbols)
+ {
+ m_encodedData = symbols;
+ m_dataLen = dataLen;
+ m_noSeenDifferentSymbols = noSeenDifferentSymbols;
+ }
+ }
+
+ /**
+ * Run MTF and RLE encoding of the data in {@code data}.
+ * @param data The data to encode.
+ * @param dataLen The data length.
+ * @param symbols An array containing all different symbols that occur in
+ * {@code data}.
+ * @return MTF and RLE encoded data.
+ */
+ private MTFAndRLEResult moveToFrontAndRunLengthEncode(final byte[] data, final int dataLen, final byte[] symbols)
+ {
+ // This array will contain the run length encoded result. The result
+ // will probably be shorter than data.length thanks to the run length
+ // encoding, but data.length (+ 1 for the EOB symbol) is the worst case
+ // length.
+ boolean[] seenSymbols = new boolean[259];
+ // RUNA and RUNB are always seen (even when they are not...)
+ seenSymbols[0] = true;
+ seenSymbols[1] = true;
+ int noSeenSymbols = 2;
+
+ // Initialize the move to front alphabet
+ final byte[] mtfAlphabet = m_scratchpad.m_mtfAlphabet;
+ System.arraycopy(INITIAL_MTF_ALPHABET, 0, mtfAlphabet, 0, mtfAlphabet.length);
+
+		// The array to store the encoded data in. The result will probably be
+		// shorter than dataLen thanks to the run length encoding, but dataLen + 1
+		// (for the EOB symbol) is the worst case length.
+ final int[] encodedData = m_scratchpad.m_encodedData;
+
+ // Create a mapping between a symbol and its index number in the array
+ // of symbols
+ final byte[] sequenceMap = createSequenceMap(symbols);
+
+ int lastSymbolIndex = 0;
+ int curOutArrayIndex = 0;
+ // A counter to keep track of the number of equal symbols in a row for
+ // the run length encoding
+ int noSame = 0;
+ for (int curInArrayIndex = 0; curInArrayIndex < dataLen; curInArrayIndex++)
+ {
+ final byte curSymbolIndex = sequenceMap[data[curInArrayIndex] & 0xFF];
+ if (curSymbolIndex == lastSymbolIndex)
+ {
+ noSame++;
+ }
+ else
+ {
+ if (noSame > 0)
+ {
+ // Run length encode
+ curOutArrayIndex += addRunaAndRunb(m_scratchpad.m_encodedData, curOutArrayIndex, noSame);
+ noSame = 0;
+ }
+
+ // Search for the current symbol in the MTF alphabet and count
+ // the distance
+ int j = 0;
+ byte lastMtf = mtfAlphabet[0];
+
+ while (mtfAlphabet[++j] != curSymbolIndex)
+ {
+ final byte nextLastMtf = mtfAlphabet[j];
+ mtfAlphabet[j] = lastMtf;
+ lastMtf = nextLastMtf;
+ }
+ // Swap the symbols in the MTF alphabet.
+ mtfAlphabet[j] = lastMtf;
+ mtfAlphabet[0] = curSymbolIndex;
+
+ // Output the distance. Distance 1 gets the value 2 since
+ // RUNA and RUNB have the values 0 and 1.
+ int symbolVal = j + 1;
+ encodedData[curOutArrayIndex++] = symbolVal;
+ if (!seenSymbols[symbolVal])
+ {
+ seenSymbols[symbolVal] = true;
+ noSeenSymbols++;
+ }
+ lastSymbolIndex = curSymbolIndex;
+ }
+ }
+ if (noSame > 0)
+ {
+ // One last run length encoding
+ curOutArrayIndex += addRunaAndRunb(encodedData, curOutArrayIndex, noSame);
+ }
+ return new MTFAndRLEResult(encodedData, curOutArrayIndex, noSeenSymbols);
+ }
+
+ private static class EncodeAllSegmentsResult
+ {
+ // The shortest encoded segment length for all segments.
+ private int m_shortestLength;
+ // The longest encoded segment length for all segments.
+ private int m_longestLength;
+ // A list with encoding results (the bit length) for each segment and
+ // tree.
+ private int[][] m_encodingResults;
+ // For each segment, the index of the tree that gave the shortest
+ // encoded block.
+ private int[] m_treesUsed;
+ }
+
+ /**
+ * Encode all 50-byte segments with all trees and count the encoded lengths.
+ * By doing this we can select the best Huffman tree for each segment by
+	 * seeing which tree gave the shortest encoded data.
+ * @param data The data to encode.
+ * @param dataLen The length of the data. (This may be shorter than the
+ * {@code data} array.)
+ * @param codeLengths An array of code lengths for each symbol for each
+ * investigated Huffman tree.
+ * @param numberOfHuffmanSegments The number of 50-byte segments in the
+ * current block.
+ * @param numberOfDifferentSymbols The number of different symbols in the
+ * data. This is the value of the EOB symbol + 1.
+ * @param res The result of the operation is stored in this object.
+ */
+ private void encodeAllSegmentsWithAllTrees(final int[] data, final int dataLen, final int[][] codeLengths, final int numberOfHuffmanSegments, final int numberOfDifferentSymbols, final EncodeAllSegmentsResult res) throws IOException
+ {
+ final int noTrees = codeLengths.length;
+ final int[][] encodingResults = m_scratchpad.m_encodingResults;
+ // The best tree for each segment
+ final int[] treesUsed = new int[numberOfHuffmanSegments];
+ // The shortest seen shortest length for all segments
+ int shortestLength = Integer.MAX_VALUE;
+ // The longest seen -shortest- length for all segments
+ int longestLength = 0;
+ for (int segmentNo = 0; segmentNo < numberOfHuffmanSegments; segmentNo++)
+ {
+ // Encode this segment with all Huffman trees
+ int shortestLengthForSegment = Integer.MAX_VALUE;
+ int bestTreeIndex = 0;
+ final int[] segmentEncodingResultPerTree = new int[noTrees];
+ final int segmentStart = segmentNo * NO_OF_SYMBOLS_PER_SEGMENT;
+ final int segmentEnd = Math.min(segmentStart + NO_OF_SYMBOLS_PER_SEGMENT, dataLen);
+ for (int treeNo = 0; treeNo < noTrees; treeNo++)
+ {
+ final int[] curTreeCodeLengths = codeLengths[treeNo];
+ int bitLen = 0;
+ for (int j = segmentStart; j < segmentEnd; j++)
+ {
+ bitLen += curTreeCodeLengths[data[j]];
+ }
+
+ if (treeNo == 0)
+ {
+ shortestLengthForSegment = bitLen;
+ }
+ else if (bitLen < shortestLengthForSegment)
+ {
+ shortestLengthForSegment = bitLen;
+ bestTreeIndex = treeNo;
+ }
+ segmentEncodingResultPerTree[treeNo] = bitLen;
+ }
+
+ if (segmentNo == 0)
+ {
+ shortestLength = longestLength = shortestLengthForSegment;
+ }
+ // Don't count the length of the last segment since that is likely
+ // to contain less than 50 symbols.
+ else if ((segmentNo < (numberOfHuffmanSegments - 1)) && (shortestLengthForSegment < shortestLength))
+ {
+ shortestLength = shortestLengthForSegment;
+ }
+ else if (shortestLengthForSegment > longestLength)
+ {
+ longestLength = shortestLengthForSegment;
+ }
+ encodingResults[segmentNo] = segmentEncodingResultPerTree;
+ treesUsed[segmentNo] = bestTreeIndex;
+ }
+
+ res.m_encodingResults = encodingResults;
+ res.m_longestLength = longestLength;
+ res.m_shortestLength = shortestLength;
+ res.m_treesUsed = treesUsed;
+ }
+
+ /**
+	 * Divide all segments into categories (one per Huffman tree) based on how
+	 * well they were encoded by the globally optimal Huffman tree. An optimal
+	 * Huffman tree is created
+ * for each category.
+ * @param data The data to encode.
+ * @param dataLen The length of the data.
+ * @param eobSymbol The value of the special EOB symbol. This is the highest
+ * used symbol value.
+ * @param numberOfHuffmanTrees The number of Huffman trees to create.
+ * @param numberOfSegments The number of 50-byte segments in the block.
+ * @param easr The encoding results from encoding the data with the globally
+ * optimal Huffman tree.
+ * @param globallyOptimalTree The symbol code lengths for the globally
+ * optimal Huffman tree.
+ * @return The symbols code lengths for each created tree.
+ */
+ private int[][] createNewTrees(final int[] data, final int dataLen, final int eobSymbol, final int numberOfHuffmanTrees, final int numberOfSegments, final EncodeAllSegmentsResult easr, final int[] globallyOptimalTree)
+ {
+ // Clear the frequencies array
+ final int[][] frequencies = m_scratchpad.m_frequencies2d;
+ for (int i = 0; i < numberOfHuffmanTrees; i++)
+ {
+ Arrays.fill(frequencies[i], 0);
+ }
+
+ // How big difference in number of bits is there between the shortest
+ // and the longest encoded segment?
+ final int maxDistance = easr.m_longestLength - easr.m_shortestLength;
+ if (maxDistance == 0)
+ {
+ // Nothing to do. We're as optimal as can be.
+ return new int[][] { globallyOptimalTree };
+ }
+
+ final int numberOfCategories = numberOfHuffmanTrees;
+ // Which category does each 50-byte segment fall into?
+ final int[] categoryPerSegment = m_scratchpad.m_categoriesPerSegment;
+ // How many 50-byte segments fall into each category?
+ final int[] noSegmentsPerCategory = new int[numberOfCategories];
+
+ // This array is used to determine which category a segment falls into
+ // based on its encoded length.
+ final int[] catArray = CATEGORY_PER_NO_OF_TREES_AND_PERCENTAGE[numberOfHuffmanTrees - 2];
+
+ // Don't include the last segment in the statistics since that is likely
+ // to be shorter
+ for (int i = 0; i < numberOfSegments - 1; i++)
+ {
+ // The shortest length for this segment.
+ final int segmentLen = easr.m_encodingResults[i][easr.m_treesUsed[i]];
+ final int percentage = (100 * (segmentLen - easr.m_shortestLength)) / maxDistance;
+ assert percentage >= 0;
+ assert percentage <= 100;
+ final int catNo = catArray[percentage];
+ noSegmentsPerCategory[catNo]++;
+ categoryPerSegment[i] = catNo;
+ }
+
+ for (int i = 0; i < numberOfSegments; i++)
+ {
+ final int segmentStart = i * NO_OF_SYMBOLS_PER_SEGMENT;
+ final int segmentEnd = Math.min(segmentStart + NO_OF_SYMBOLS_PER_SEGMENT, dataLen);
+ final int[] curCatFreqs = frequencies[categoryPerSegment[i]];
+ for (int j = segmentStart; j < segmentEnd; j++)
+ {
+ curCatFreqs[data[j]]++;
+ }
+ }
+
+ int noNewTrees = 0;
+ for (int i = 0; i < numberOfCategories; i++)
+ {
+ if (noSegmentsPerCategory[i] > 0)
+ {
+ // Create a new Huffman tree for this category.
+ noNewTrees++;
+ }
+ }
+ assert noNewTrees > 0;
+
+ int[][] res = new int[noNewTrees][];
+ int treeNo = 0;
+ for (int i = 0; i < numberOfCategories; i++)
+ {
+ if (noSegmentsPerCategory[i] > 0)
+ {
+ res[treeNo++] = HighValueBranchHuffmanTree.createCodeLengths(frequencies[i], eobSymbol + 1, MAX_HUFFMAN_BIT_LENGTH, m_scratchpad);
+ }
+ }
+ return res;
+ }
+
+ /**
+ * Refine the Huffman trees based on the encoding results. For each tree,
+ * make it optimal based on the data in the segments that it was the best
+ * tree for.
+ * @param data The data to encode.
+ * @param dataLen The length of the data to encode.
+ * @param codeLengths The code length for each symbol for each tree.
+ * @param easr The results when encoding the data with this set of trees.
+ * @param eobSymbol The value of the EOB symbol. This is the highest symbol
+ * value.
+ * @return Symbol code lengths for the refined trees.
+ */
+ private int[][] refineTreesBasedOnEncodingResults(final int[] data, final int dataLen, final int[][] codeLengths, final EncodeAllSegmentsResult easr, final int eobSymbol)
+ {
+ // Clear the frequencies array
+ final int[][] frequencies = m_scratchpad.m_frequencies2d;
+ for (int i = 0; i < codeLengths.length; i++)
+ {
+ Arrays.fill(frequencies[i], 0);
+ }
+
+ int segmentNo = 0;
+ int noInSegment = 0;
+ int curTree = easr.m_treesUsed[segmentNo];
+ for (int i = 0; i < dataLen; i++)
+ {
+ int symbolVal = data[i];
+ frequencies[curTree][symbolVal]++;
+ if (++noInSegment == NO_OF_SYMBOLS_PER_SEGMENT)
+ {
+ segmentNo++;
+ // If the data length is a multiple of 50, we do a switch after
+ // encoding the last symbol which will make segmentNo greater
+ // than the index of the last element in easr.m_treesUsed.
+ // Thus the check below.
+ if (segmentNo < easr.m_treesUsed.length)
+ {
+ curTree = easr.m_treesUsed[segmentNo];
+ }
+ noInSegment = 0;
+ }
+ }
+
+ // Recreate the trees based on the gathered frequencies
+ int[][] res = new int[codeLengths.length][];
+ for (int i = 0; i < codeLengths.length; i++)
+ {
+ res[i] = HighValueBranchHuffmanTree.createCodeLengths(frequencies[i], eobSymbol + 1, MAX_HUFFMAN_BIT_LENGTH, m_scratchpad);
+ }
+ return res;
+ }
+
+ /**
+ * Get the number of Huffman trees to use based on the number of 50-byte
+ * segments in the data.
+ */
+ private byte getNumberOfHuffmanTrees(int noSegments)
+ {
+ // Values from bzip2
+ if (noSegments < 200)
+ {
+ return 2;
+ }
+ else if (noSegments < 600)
+ {
+ return 3;
+ }
+ else if (noSegments < 1200)
+ {
+ return 4;
+ }
+ else if (noSegments < 2400)
+ {
+ return 5;
+ }
+ else
+ {
+ return 6;
+ }
+ }
+
+ /**
+ * Get the minimum and maximum code length from the array.
+ * @return An int array containing the minimum and the maximum code lengths,
+ * in that order.
+ */
+ private int[] getMinAndMaxCodeLengths(final int[] codeLengths)
+ {
+ int minLength = codeLengths[0];
+ int maxLength = codeLengths[0];
+ for (int i = 1; i < codeLengths.length; i++)
+ {
+ if (codeLengths[i] < minLength)
+ {
+ minLength = codeLengths[i];
+ }
+ else if (codeLengths[i] > maxLength)
+ {
+ maxLength = codeLengths[i];
+ }
+ }
+ return new int[] { minLength, maxLength };
+ }
+
+ /**
+ * Create the Huffman trees that should be used for encoding the current
+	 * block. First, a globally optimal tree is created. Then new trees are
+ * created from information on how well the globally optimal tree encoded
+ * different segments. Lastly, the created trees are optimized based on the
+ * data in the segments that they are used to encode. This last step is
+ * repeated a configurable number of times ({@code
+ * m_numberOfHuffmanTreeRefinementIterations}).
+ * @param data The data that should be encoded using the created Huffman
+ * trees.
+ * @param dataLen The length of the data, excluding the trailing EOB symbol.
+ * @param noSymbolsUsed The number of different symbols used in the data.
+ */
+ private HuffmanTreesAndUsage createHuffmanTrees(final int[] data, final int dataLen, final int noSymbolsUsed) throws IOException
+ {
+ HuffmanTreesAndUsage res = new HuffmanTreesAndUsage();
+
+		// The number of 50-symbol segments in the block, including the trailing
+		// EOB symbol. The "- 1 ... + 1" rounds the division up.
+ res.m_noHuffmanSegments = ((dataLen - 1 + 1) / NO_OF_SYMBOLS_PER_SEGMENT) + 1;
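+		// e.g. dataLen == 100 gives (100 / 50) + 1 == 3 segments for the 101
+		// symbols including the trailing EOB symbol.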
+
+ // Create a Huffman tree for the entire input.
+ // Count the frequencies of the different bytes in the input.
+ int[] frequencies = m_scratchpad.m_frequencies;
+ Arrays.fill(frequencies, 0);
+
+ // The maximum symbol value used (before the EOB symbol) is at least 1
+ // (RUNB).
+ int maxSymbolValue = 1;
+ for (int j = 0; j < dataLen; j++)
+ {
+ int symbolVal = data[j];
+ frequencies[symbolVal]++;
+ if (symbolVal > maxSymbolValue)
+ {
+ maxSymbolValue = symbolVal;
+ }
+ }
+
+ // Now we can infer the value of the EOB (End Of Block) symbol. Add it
+ // to the end of the data. The data array is created so there should be
+ // room for it.
+ res.m_eobSymbol = maxSymbolValue + 1;
+ frequencies[res.m_eobSymbol] = 1;
+ data[dataLen] = res.m_eobSymbol;
+ final int dataLenIncEob = dataLen + 1;
+
+ // Maybe we're already done?
+ if (res.m_noHuffmanSegments < MIN_NO_OF_HUFFMAN_TREES)
+ {
+ // We have to encode at least two trees anyway.
+ res.m_trees = new HighValueBranchHuffmanTree[MIN_NO_OF_HUFFMAN_TREES];
+ int[] codeLengths = HighValueBranchHuffmanTree.createCodeLengths(frequencies, res.m_eobSymbol + 1, MAX_HUFFMAN_BIT_LENGTH, m_scratchpad);
+ int[] minAndMaxLength = getMinAndMaxCodeLengths(codeLengths);
+ HighValueBranchHuffmanTree tree = new HighValueBranchHuffmanTree(codeLengths, minAndMaxLength[0], minAndMaxLength[1], true);
+ for (int i = 0; i < MIN_NO_OF_HUFFMAN_TREES; i++)
+ {
+ res.m_trees[i] = tree;
+ }
+ // Use tree #0 for all segments
+ res.m_treeUsage = new int[res.m_noHuffmanSegments];
+ }
+ else
+ {
+ final int[][][] huffmanCodeLengths = new int[m_numberOfHuffmanTreeRefinementIterations + 1][][];
+ final int[] codeLengthsForGloballyOptimalTree = HighValueBranchHuffmanTree.createCodeLengths(frequencies, res.m_eobSymbol + 1, MAX_HUFFMAN_BIT_LENGTH, m_scratchpad);
+ final EncodeAllSegmentsResult easr = new EncodeAllSegmentsResult();
+ encodeAllSegmentsWithAllTrees(data, dataLen, new int[][] { codeLengthsForGloballyOptimalTree }, res.m_noHuffmanSegments, res.m_eobSymbol + 1, easr);
+ huffmanCodeLengths[0] = createNewTrees(data, dataLen, res.m_eobSymbol, getNumberOfHuffmanTrees(res.m_noHuffmanSegments), res.m_noHuffmanSegments, easr, codeLengthsForGloballyOptimalTree);
+
+ // Select the set of trees that gives the shortest total data length
+ int bestIndex = -1;
+ int bestLength = Integer.MAX_VALUE;
+ int[] bestTreeUsage = null;
+ for (int i = 0; i < huffmanCodeLengths.length; i++)
+ {
+ if (i > 0)
+ {
+ // Refine the trees
+ huffmanCodeLengths[i] = refineTreesBasedOnEncodingResults(data, dataLenIncEob, huffmanCodeLengths[i - 1], easr, res.m_eobSymbol);
+ }
+ encodeAllSegmentsWithAllTrees(data, dataLenIncEob, huffmanCodeLengths[i], res.m_noHuffmanSegments, res.m_eobSymbol + 1, easr);
+
+ int totLen = 0;
+ for (int j = 0; j < easr.m_treesUsed.length; j++)
+ {
+ totLen += easr.m_encodingResults[j][easr.m_treesUsed[j]];
+ }
+
+ // Previously the length of each encoded tree was added to the
+ // total length. That had negligible effect on the total encoded
+ // length and a small impact on the performance.
+ if (totLen < bestLength)
+ {
+ bestIndex = i;
+ bestLength = totLen;
+ bestTreeUsage = easr.m_treesUsed;
+ }
+ }
+
+ int noTrees = huffmanCodeLengths[bestIndex].length;
+ if (noTrees < MIN_NO_OF_HUFFMAN_TREES)
+ {
+ res.m_trees = new HighValueBranchHuffmanTree[MIN_NO_OF_HUFFMAN_TREES];
+ int[] minAndMaxLength = getMinAndMaxCodeLengths(huffmanCodeLengths[bestIndex][0]);
+ for (int i = 0; i < MIN_NO_OF_HUFFMAN_TREES; i++)
+ {
+ res.m_trees[i] = new HighValueBranchHuffmanTree(huffmanCodeLengths[bestIndex][0], minAndMaxLength[0], minAndMaxLength[1], true);
+ }
+ }
+ else
+ {
+ res.m_trees = new HighValueBranchHuffmanTree[huffmanCodeLengths[bestIndex].length];
+ for (int i = 0; i < huffmanCodeLengths[bestIndex].length; i++)
+ {
+ int[] minAndMaxLengths = getMinAndMaxCodeLengths(huffmanCodeLengths[bestIndex][i]);
+ res.m_trees[i] = new HighValueBranchHuffmanTree(huffmanCodeLengths[bestIndex][i], minAndMaxLengths[0], minAndMaxLengths[1], true);
+ }
+ }
+ res.m_treeUsage = bestTreeUsage;
+ }
+ return res;
+ }
+
+ /**
+ * Encode the Huffman tree and write it to the output.
+ * @param tree The tree to encode.
+ * @param numberOfDifferentSymbols The number of different symbols in the
+ * tree.
+ * @param out The output to write the tree to.
+ */
+ static void encodeHuffmanTree(final HighValueBranchHuffmanTree tree, final int numberOfDifferentSymbols, final BitOutput out) throws IOException
+ {
+ // Huffman bit length for the first symbol (0..17)
+ int len = tree.getBitLength(0);
+ out.writeBitsLittleEndian(len, 5);
+ // Encode a delta length compared to the previous length for each
+ // symbol.
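+		// e.g. going from a code length of 5 down to 3 is written as the bits
+		// 11 11 0: two "alter + make shorter" steps followed by a terminating zero.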
+ for (int j = 0; j < numberOfDifferentSymbols; j++)
+ {
+ int prevLen = len;
+ len = tree.getBitLength(j);
+ while (len != prevLen)
+ {
+ // Alter length
+ out.writeBit(true);
+ if (prevLen < len)
+ {
+ // Make longer
+ out.writeBit(false);
+ prevLen++;
+ }
+ else
+ {
+ // Make shorter
+ out.writeBit(true);
+ prevLen--;
+ }
+ }
+ // We are at the right length
+ out.writeBit(false);
+ }
+ }
+
+ /**
+ * Write the block header for an encoded data block.
+ * @param blockChecksum The block checksum.
+ * @param bwFirstPointer The pointer to the first element in the Burrows
+ * Wheeler encoded data.
+ * @param seenDifferentBytes Bit flags that are switched on for all bytes
+ * that are seen in the written data.
+ * @param mtfrle Results from the MTF and RLE encodings.
+ * @param htau The different Huffman trees and information on when they are
+ * used.
+ */
+ private void writeBlockHeader(final int blockChecksum, int bwFirstPointer, boolean[] seenDifferentBytes, MTFAndRLEResult mtfrle, HuffmanTreesAndUsage htau) throws IOException
+ {
+ // Block magic
+ for (int i = 0; i < BLOCK_MAGIC.length; i++)
+ {
+ m_out.writeBitsLittleEndian(BLOCK_MAGIC[i] & 0xFF, 8);
+ }
+ // Checksum
+ m_out.writeBitsLittleEndian(blockChecksum, 32);
+ // Randomized? (no)
+ m_out.writeBit(false);
+ // Starting pointer into Burrows Wheeler matrix (24 bits)
+ m_out.writeBitsLittleEndian(bwFirstPointer, 24);
+
+ boolean[] segmentsWithData = new boolean[16];
+ boolean[][] seenData = new boolean[16][16];
+ for (int i = 0; i < 256; i++)
+ {
+ if (seenDifferentBytes[i])
+ {
+ segmentsWithData[i / 16] = true;
+ seenData[i / 16][i % 16] = true;
+ }
+ }
+
+		// Write a flag for each block of 16 byte values that has at least one
+		// byte occurring in the encoded data.
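+		// (For illustration: if the only byte value used were 0x41, block flag
+		// no 4 of the 16 would be set and, within that block, flag no 1.)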
+ for (int i = 0; i < 16; i++)
+ {
+ m_out.writeBit(segmentsWithData[i]);
+ }
+ // For each block used, write a flag for each of the used bytes in that
+ // block.
+ for (int i = 0; i < 16; i++)
+ {
+ if (segmentsWithData[i])
+ {
+ for (int j = 0; j < 16; j++)
+ {
+ m_out.writeBit(seenData[i][j]);
+ }
+ }
+ }
+
+ // The number of Huffman trees used (2..6)
+ m_out.writeBits(htau.m_trees.length, 3);
+
+ // The number of times the Huffman trees are switched (each 50 bytes)
+ m_out.writeBitsLittleEndian(htau.m_noHuffmanSegments, 15);
+
+ // Which Huffman tree is selected at each switch? Use a zero-terminated
+ // bit run of MTF:ed index values
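+		// (e.g. an MTF:ed index of 0 is written as the single bit 0 and an
+		// index of 3 as the bits 1110)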
+
+ // Init the MTF alphabet
+ int[] mtfAlpha = new int[htau.m_trees.length];
+ for (int i = 0; i < htau.m_trees.length; i++)
+ {
+ mtfAlpha[i] = i;
+ }
+ int[] treeUsageMtf = new int[htau.m_noHuffmanSegments];
+ new IntMoveToFront(mtfAlpha).encode(htau.m_treeUsage, treeUsageMtf);
+
+ for (int i = 0; i < htau.m_noHuffmanSegments; i++)
+ {
+ // A zero-terminated bit run for the values 0..5
+ int val = 0;
+ while (val < treeUsageMtf[i])
+ {
+ m_out.writeBit(true);
+ val++;
+ }
+ m_out.writeBit(false);
+ }
+
+ // Encode each Huffman tree
+ for (int i = 0; i < htau.m_trees.length; i++)
+ {
+ encodeHuffmanTree(htau.m_trees[i], htau.m_eobSymbol + 1, m_out);
+ }
+ }
+
+ private static class HuffmanTreesAndUsage
+ {
+ private HighValueBranchHuffmanTree[] m_trees;
+ private int m_noHuffmanSegments;
+ private int[] m_treeUsage;
+ private int m_eobSymbol;
+ }
+
+ void encode() throws IOException
+ {
+ // Fix the block overshoot. Copy DATA_OVERSHOOT bytes to the end of the
+ // array. Repeat the data if the block is shorter than DATA_OVERSHOOT
+ // bytes.
+ int noCopied = 0;
+ while (noCopied < ThreeWayRadixQuicksort.DATA_OVERSHOOT)
+ {
+ int noToCopy = Math.min(ThreeWayRadixQuicksort.DATA_OVERSHOOT - noCopied, m_blockSize);
+ System.arraycopy(m_block, 0, m_block, m_blockSize + noCopied, noToCopy);
+ noCopied += noToCopy;
+ }
+
+ // Sort the data in the block.
+ // data contains the written data after the initial move to front
+ // transformation
+ BurrowsWheelerEncodingResult burrWhee = new BurrowsWheelerEncoder(m_block, m_blockSize, m_scratchpad).encode();
+
+ // Run Move to front and run length encoding transformations on the
+ // Burrows Wheeler encoded data
+ MTFAndRLEResult rleMtfSymbols = moveToFrontAndRunLengthEncode(burrWhee.m_lastColumn, m_blockSize, getSeenByteValues());
+ int[] encodedData = rleMtfSymbols.m_encodedData;
+
+ // Create the Huffman trees. This method also infers the value of the
+ // EOB symbol and adds it to the end of the encodedData array.
+ HuffmanTreesAndUsage htau = createHuffmanTrees(rleMtfSymbols.m_encodedData, rleMtfSymbols.m_dataLen, rleMtfSymbols.m_noSeenDifferentSymbols);
+
+ writeBlockHeader(m_blockChecksum, burrWhee.m_firstPointer, m_seenDifferentBytes, rleMtfSymbols, htau);
+
+ // Write the Huffman encoded data. The EOB symbol is last in the data.
+ int swapNo = 0;
+ int noLeftUntilSwap = 1;
+ HighValueBranchHuffmanTree curTree = null;
+ // +1 == EOB symbol
+ for (int i = 0; i < rleMtfSymbols.m_dataLen + 1; i++)
+ {
+ if (--noLeftUntilSwap == 0)
+ {
+ curTree = htau.m_trees[htau.m_treeUsage[swapNo++]];
+ noLeftUntilSwap = NO_OF_SYMBOLS_PER_SEGMENT;
+ }
+ curTree.write(m_out, encodedData[i]);
+ }
+ assert swapNo == htau.m_noHuffmanSegments;
+
+ if (m_blockEncoderCallback != null)
+ {
+ m_blockEncoderCallback.reportBlockDone();
+ }
+ }
+}
diff --git a/src/main/java/org/at4j/comp/bzip2/BlockEncoderRunnable.java b/src/main/java/org/at4j/comp/bzip2/BlockEncoderRunnable.java
new file mode 100644
index 0000000..2488594
--- /dev/null
+++ b/src/main/java/org/at4j/comp/bzip2/BlockEncoderRunnable.java
@@ -0,0 +1,62 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.at4j.comp.bzip2;
+
+import java.io.IOException;
+
+/**
+ * This is used by the {@link BlockOutputStream} to encode a block in a separate
+ * encoding thread. It uses a {@link BlockEncoder} to do the actual encoding.
+ * @author Karl Gustafsson
+ * @since 1.1
+ */
+final class BlockEncoderRunnable implements Runnable
+{
+ private final BlockEncoder m_encoder;
+ private final Object m_errorOwner;
+
+ BlockEncoderRunnable(final BlockEncoder be, final Object errorOwner)
+ {
+ m_encoder = be;
+ m_errorOwner = errorOwner;
+ }
+
+ public void run()
+ {
+ try
+ {
+ m_encoder.setScratchpad(((EncodingThread) Thread.currentThread()).getScratchpad());
+ m_encoder.encode();
+ }
+ catch (IOException e)
+ {
+ ((EncodingThread) Thread.currentThread()).getErrorState().registerError(e, m_errorOwner);
+ }
+ catch (RuntimeException e)
+ {
+ ((EncodingThread) Thread.currentThread()).getErrorState().registerError(e, m_errorOwner);
+ }
+ catch (Error e)
+ {
+ ((EncodingThread) Thread.currentThread()).getErrorState().registerError(e, m_errorOwner);
+ }
+ }
+}
diff --git a/src/main/java/org/at4j/comp/bzip2/BlockOutputStream.java b/src/main/java/org/at4j/comp/bzip2/BlockOutputStream.java
new file mode 100644
index 0000000..5837bf2
--- /dev/null
+++ b/src/main/java/org/at4j/comp/bzip2/BlockOutputStream.java
@@ -0,0 +1,355 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.at4j.comp.bzip2;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.Arrays;
+
+import org.at4j.support.io.BitOutput;
+import org.at4j.support.io.LittleEndianBitOutputStream;
+
+/**
+ * Used by {@link BZip2OutputStream} to RLE encode data and then write it to
+ * compressed blocks.
+ * @author Karl Gustafsson
+ * @since 1.1
+ */
+final class BlockOutputStream extends OutputStream
+{
+ /**
+ * The different states of the run length encoder.
+ */
+ private static enum RLEState
+ {
+ ENCODING_SINGLE, COUNTING_MULTIPLE;
+ }
+
+ // The maximum number of encoded repeated bytes.
+ private static final int MAX_NO_OF_RLE_REPEATS = 251;
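+	// (In this initial run length encoding, four equal bytes are written
+	// literally, followed by a count, 0..251, of additional repeats. For
+	// illustration: 260 equal bytes become b b b b 251 b b b b 1.)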
+
+ // The state of the run length encoder.
+ private RLEState m_rleState;
+ // The last byte value that write was called with. Used to keep track of
+ // the run length encoding.
+ private int m_last = -1;
+	// The number of equal bytes in a row that write has been called with. Used
+	// to keep track of the run length encoding.
+ private int m_numberOfSame;
+ // Encoded data is written to this.
+ private final BitOutput m_wrapped;
+ // The size of a Burrows Wheeler block, in bytes.
+ private final int m_blockSize;
+ // How many times should the Huffman trees be refined before encoding data?
+ private final int m_numberOfHuffmanTreeRefinementIterations;
+	// Bit flags indicating which bytes occur at least once in the current
+ // block.
+ private boolean[] m_seenDifferentBytesInCurBlock;
+ // The data in the current block.
+ private byte[] m_block;
+ // If we are using separate encoding threads, this executor is used to
+ // schedule blocks for execution. Otherwise it is null.
+ private final BZip2EncoderExecutorServiceImpl m_encodingExecutor;
+ // A token identifying who owns the errors that may be caused by jobs that
+ // we might schedule in the executor. This is null if no executor is used.
+ private final Object m_errorOwner;
+
+ // Contains preallocated data structures. Used to reduce the number of
+ // temporary objects that are created and thus avoid time spent gc:ing.
+ // This is null if an executor is used for encoding.
+ private final EncodingScratchpad m_scratchpad;
+
+ // If we use several encoder threads, this object is used for writing the
+ // encoded blocks in the right order. Otherwise it is null.
+ private final EncodedBlockWriter m_encodedBlockWriter;
+
+ // The checksum for the current block.
+ private CRC m_blockChecksum;
+ // The checksum for the entire file.
+ private int m_fileChecksum = 0;
+
+ // The number of different bytes seen in the current block.
+ private int m_noSeenDifferentBytesInCurBlock;
+ private int m_blockPointer;
+
+ private int m_blockNo = 0;
+
+ BlockOutputStream(BitOutput wrapped, int blockSize, int numberOfHuffmanTreeRefinementIterations, BZip2EncoderExecutorServiceImpl ex, Object errorOwner, EncodedBlockWriter ebw, EncodingScratchpad sp)
+ {
+		// Exactly one of the encoding executor and the scratchpad must be given.
+ assert ex == null ^ sp == null;
+
+ m_wrapped = wrapped;
+ m_blockSize = blockSize;
+ m_numberOfHuffmanTreeRefinementIterations = numberOfHuffmanTreeRefinementIterations;
+ m_blockChecksum = new CRC();
+ m_scratchpad = sp;
+ // May be null.
+ m_encodingExecutor = ex;
+ // May be null
+ m_errorOwner = errorOwner;
+ // May be null.
+ m_encodedBlockWriter = ebw;
+
+ startNewBlock();
+ }
+
+ private void startNewBlock()
+ {
+ m_blockPointer = 0;
+
+ if (m_encodingExecutor != null)
+ {
+ // We use several threads for encoding. Create new instances for
+ // data that may be used right now by an encoder.
+ m_seenDifferentBytesInCurBlock = new boolean[256];
+ m_block = new byte[m_blockSize + ThreeWayRadixQuicksort.DATA_OVERSHOOT];
+ }
+ else
+ {
+ // We encode in this thread. It is safe to reuse variables.
+ if (m_seenDifferentBytesInCurBlock == null)
+ {
+ m_seenDifferentBytesInCurBlock = new boolean[256];
+ }
+ else
+ {
+ Arrays.fill(m_seenDifferentBytesInCurBlock, false);
+ }
+
+ if (m_block == null)
+ {
+ m_block = new byte[m_blockSize + ThreeWayRadixQuicksort.DATA_OVERSHOOT];
+ }
+ }
+ m_noSeenDifferentBytesInCurBlock = 0;
+
+ // Reset the run length encoder state
+ m_last = -1;
+ m_numberOfSame = 0;
+ m_rleState = RLEState.ENCODING_SINGLE;
+ }
+
+ private boolean isFull()
+ {
+ return m_blockPointer == m_blockSize;
+ }
+
+ private boolean isEmpty()
+ {
+ return m_blockPointer == 0;
+ }
+
+ int getFileChecksum()
+ {
+ return m_fileChecksum;
+ }
+
+ /**
+ * Write a compressed data block.
+ */
+ private void writeCurBlock() throws IOException
+ {
+ final int blockChecksum = m_blockChecksum.getValue();
+ m_blockChecksum = new CRC();
+ if (m_encodingExecutor == null)
+ {
+ // Encode the block in the current thread.
+ BlockEncoder be = new BlockEncoder(m_block, m_blockNo, m_blockPointer, blockChecksum, m_seenDifferentBytesInCurBlock, m_noSeenDifferentBytesInCurBlock, m_numberOfHuffmanTreeRefinementIterations, m_wrapped, null);
+ be.setScratchpad(m_scratchpad);
+ be.encode();
+ }
+ else
+ {
+ // Hand off the block to another thread for encoding.
+
+ // Allocate an output buffer that is 2/3rds of the size of the
+ // written data.
+ ByteArrayOutputStream baos = new ByteArrayOutputStream((2 * m_blockPointer) / 3);
+ BitOutput out = new LittleEndianBitOutputStream(baos);
+ BlockEncodedCallback bec = new BlockEncodedCallback(m_blockNo, baos, out, m_encodedBlockWriter);
+ BlockEncoder be = new BlockEncoder(m_block, m_blockNo, m_blockPointer, blockChecksum, m_seenDifferentBytesInCurBlock, m_noSeenDifferentBytesInCurBlock, m_numberOfHuffmanTreeRefinementIterations, out, bec);
+ m_encodingExecutor.execute(new BlockEncoderRunnable(be, m_errorOwner));
+ }
+
+ // Update the file checksum
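+		// (rotate the running checksum one bit to the left, then xor in the
+		// block checksum)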
+ m_fileChecksum = (m_fileChecksum << 1) | (m_fileChecksum >>> 31);
+ m_fileChecksum ^= blockChecksum;
+
+ m_blockNo++;
+ }
+
+ /**
+ * Write a single byte.
+ */
+ private void writeByte(final int b) throws IOException
+ {
+ m_block[m_blockPointer++] = (byte) (b & 0xFF);
+ if (!m_seenDifferentBytesInCurBlock[b])
+ {
+ m_seenDifferentBytesInCurBlock[b] = true;
+ m_noSeenDifferentBytesInCurBlock++;
+ }
+
+ if (isFull())
+ {
+ // File f = new File("/tmp/block_" + ++m_blockNo + ".dat");
+ // OutputStream os = new BufferedOutputStream(new FileOutputStream(f));
+ // try
+ // {
+ // os.write(m_block, 0, m_blockPointer);
+ // }
+ // finally
+ // {
+ // os.close();
+ // }
+
+ writeCurBlock();
+ startNewBlock();
+ }
+ }
+
+ @Override
+ public void write(final int b) throws IOException
+ {
+ // Run length encode
+ switch (m_rleState)
+ {
+ case ENCODING_SINGLE:
+ if (b == m_last)
+ {
+ m_numberOfSame++;
+ if (m_numberOfSame == 4)
+ {
+ if (m_blockPointer == m_blockSize - 1)
+ {
+ // Corner case. bzip2 cannot handle blocks that end
+ // with four equal bytes. End this block one byte
+ // earlier.
+ writeCurBlock();
+ startNewBlock();
+ write(b);
+ return;
+ }
+ else
+ {
+ // Four equal in a row. Change state
+ m_rleState = RLEState.COUNTING_MULTIPLE;
+ m_numberOfSame = 0;
+ }
+ }
+ }
+ else
+ {
+ m_last = b;
+ m_numberOfSame = 1;
+ }
+ m_blockChecksum.update(b);
+ writeByte(b);
+ break;
+
+ case COUNTING_MULTIPLE:
+ if (b == m_last)
+ {
+ m_numberOfSame++;
+ if (m_numberOfSame == MAX_NO_OF_RLE_REPEATS)
+ {
+ // Cannot repeat this anymore. Update checksum, write
+ // and switch state.
+ for (int i = 0; i < MAX_NO_OF_RLE_REPEATS; i++)
+ {
+ m_blockChecksum.update(b);
+ }
+ writeByte(MAX_NO_OF_RLE_REPEATS);
+ m_rleState = RLEState.ENCODING_SINGLE;
+ m_numberOfSame = 0;
+ }
+ }
+ else
+ {
+ // A byte that is not same as the last. Stop counting,
+ // update the checksum and change state.
+ for (int i = 0; i < m_numberOfSame; i++)
+ {
+ m_blockChecksum.update(m_last);
+ }
+ writeByte(m_numberOfSame);
+ m_blockChecksum.update(b);
+ writeByte(b);
+ m_numberOfSame = 1;
+ m_last = b;
+ m_rleState = RLEState.ENCODING_SINGLE;
+ }
+ break;
+
+ default:
+ throw new RuntimeException("Unknown encoding state " + m_rleState + ". This is a bug");
+ }
+ }
+
+ @Override
+ public void write(final byte[] data) throws IOException
+ {
+ for (int i = 0; i < data.length; i++)
+ {
+ write(data[i] & 0xFF);
+ }
+ }
+
+ @Override
+ public void write(final byte[] data, final int offset, final int len) throws IOException
+ {
+ // Range validation is done by BZip2OutputStream
+ for (int i = offset; i < offset + len; i++)
+ {
+ write(data[i] & 0xFF);
+ }
+ }
+
+ @Override
+ public void close() throws IOException
+ {
+ if (m_rleState == RLEState.COUNTING_MULTIPLE)
+ {
+ // Update the checksum and write the current count.
+ for (int i = 0; i < m_numberOfSame; i++)
+ {
+ m_blockChecksum.update(m_last & 0xFF);
+ }
+ writeByte(m_numberOfSame);
+ }
+
+ if (!isEmpty())
+ {
+ writeCurBlock();
+ }
+
+ if (m_encodedBlockWriter != null)
+ {
+ // Tell the encoded block writer that we're done.
+ m_encodedBlockWriter.writeBlock(m_blockNo, null);
+ }
+
+ // Don't close the wrapped BitOutput. It will be used later on to write
+ // the EOF block.
+
+ super.close();
+ }
+}
diff --git a/src/main/java/org/at4j/comp/bzip2/BurrowsWheelerDecoder.java b/src/main/java/org/at4j/comp/bzip2/BurrowsWheelerDecoder.java
new file mode 100644
index 0000000..a9339bd
--- /dev/null
+++ b/src/main/java/org/at4j/comp/bzip2/BurrowsWheelerDecoder.java
@@ -0,0 +1,120 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ */
+package org.at4j.comp.bzip2;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Decode Burrows Wheeler encoded data.
+ * @author Karl Gustafsson
+ * @since 1.1
+ */
+final class BurrowsWheelerDecoder
+{
+ static class BWInputStream extends InputStream
+ {
+ private final byte[] m_decoded;
+ private final int[] m_ptr;
+
+ private int m_curPointer;
+ private boolean m_eof;
+ private int m_noLeftToRead;
+
+ BWInputStream(byte[] decoded, int[] ptr, int originalDataPointer)
+ {
+ m_decoded = decoded;
+ m_ptr = ptr;
+ m_curPointer = ptr[originalDataPointer];
+ m_noLeftToRead = ptr.length;
+ }
+
+ @Override
+ public int read() throws IOException
+ {
+ if (m_eof)
+ {
+ return -1;
+ }
+ final int res = m_decoded[m_curPointer] & 0xFF;
+ m_eof = --m_noLeftToRead == 0;
+ m_curPointer = m_ptr[m_curPointer];
+ return res;
+ }
+ }
+
+ private final byte[] m_decoded;
+ private final int m_noBytesDecoded;
+ private final int[] m_byteFrequencies;
+ private final int m_originalDataPointer;
+
+ /**
+ * @param encoded The encoded data. This array may be longer than the actual
+	 * amount of encoded data. The {@code noBytesEncoded} parameter determines
+	 * how much of the array will be used.
+ * @param noBytesEncoded The length of the encoded data.
+	 * @param byteFrequencies The number of times each byte occurs in the data.
+ * @param originalDataPointer The row number of the original data in the
+ * Burrows Wheeler matrix.
+ * @throws IOException On I/O errors.
+ */
+ BurrowsWheelerDecoder(byte[] encoded, int noBytesEncoded, int[] byteFrequencies, int originalDataPointer) throws IOException
+ {
+ if (originalDataPointer > noBytesEncoded)
+ {
+ throw new IOException("Invalid pointer to original data in block header " + originalDataPointer + ". It is larger than the size of data in the block " + noBytesEncoded);
+ }
+
+ m_decoded = encoded;
+ m_noBytesDecoded = noBytesEncoded;
+ m_byteFrequencies = byteFrequencies;
+ m_originalDataPointer = originalDataPointer;
+ }
+
+ InputStream decode()
+ {
+ // Calculate the transformation vector used to move from the encoded
+ // data to the decoded.
+
+ // The byte frequency array contains the frequency of each byte in the
+		// data. Create a new array tarr that, for each byte value, specifies how
+		// many bytes of lower value occur in the data.
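+		// e.g. if the data consists of two a's, one b and one c, then
+		// tarr['a'] == 0, tarr['b'] == 2 and tarr['c'] == 3.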
+ int[] tarr = new int[256];
+ tarr[0] = 0;
+ for (int i = 1; i < 256; i++)
+ {
+ tarr[i] = tarr[i - 1] + m_byteFrequencies[i - 1];
+ }
+
+ // The ptr array will contain a chain of positions of the decoded bytes
+ // in the decoded array.
+ final int[] ptr = new int[m_noBytesDecoded];
+ for (int i = 0; i < m_noBytesDecoded; i++)
+ {
+ int val = m_decoded[i] & 0xFF;
+			// Get the position in ptr where this occurrence of the byte value
+			// should be stored. Increment tarr for the value so that the next
+			// occurrence of the same value ends up in the next position.
+ int ttPos = tarr[val]++;
+ ptr[ttPos] = i;
+ }
+
+ return new BWInputStream(m_decoded, ptr, m_originalDataPointer);
+ }
+}
diff --git a/src/main/java/org/at4j/comp/bzip2/BurrowsWheelerEncoder.java b/src/main/java/org/at4j/comp/bzip2/BurrowsWheelerEncoder.java
new file mode 100644
index 0000000..83063d4
--- /dev/null
+++ b/src/main/java/org/at4j/comp/bzip2/BurrowsWheelerEncoder.java
@@ -0,0 +1,99 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.at4j.comp.bzip2;
+
+/**
+ * Burrows Wheeler encoder.
+ * @author Karl Gustafsson
+ * @since 1.1
+ */
+final class BurrowsWheelerEncoder
+{
+ static class BurrowsWheelerEncodingResult
+ {
+ // The values of the last column of the matrix
+ final byte[] m_lastColumn;
+ // The row number of the first row (the row which contains the incoming
+ // data) in the sorted matrix
+ final int m_firstPointer;
+
+ private BurrowsWheelerEncodingResult(byte[] lastColumn, int firstPointer)
+ {
+ m_lastColumn = lastColumn;
+ m_firstPointer = firstPointer;
+ }
+ }
+
+ // The shortest length that will be quicksorted rather than shell sorted
+ private static int MIN_QUICKSORT_LENGTH = 18;
+
+ // The data array containing the unencoded data.
+ private final byte[] m_data;
+ // The length of the data in the array. Data occupies the positions 0 to
+ // m_length - 1 in the array.
+ private final int m_length;
+ // Contains preallocated data structures. Used to reduce the number of
+ // temporary objects that are created and thus avoid time spent gc:ing.
+ private final EncodingScratchpad m_scratchpad;
+
+ /**
+ * @param data This array should contain a 100 byte overshoot. See
+	 * {@link ThreeWayRadixQuicksort#ThreeWayRadixQuicksort(byte[], int, int, EncodingScratchpad)}.
+ */
+ BurrowsWheelerEncoder(byte[] data, int length, EncodingScratchpad sp)
+ {
+ if (length > data.length)
+ {
+ throw new IllegalArgumentException("Invalid data length " + length + ". It must be <= the length of the data array (" + data.length + ")");
+ }
+ m_data = data;
+ m_length = length;
+ m_scratchpad = sp;
+ }
+
+ /**
+ * Run a Burrows Wheeler encoding.
+ */
+ BurrowsWheelerEncodingResult encode()
+ {
+ // Create all rotations of m_data, put them in a matrix and sort the
+ // first column. For each row in the matrix, ptr contains a pointer to
+ // the first byte of the row's m_data rotation.
+ int[] ptr = new ThreeWayRadixQuicksort(m_data, m_length, MIN_QUICKSORT_LENGTH, m_scratchpad).sort();
+
+		// Get the contents of the last column in the matrix. This, and the
+		// row number of the row that contains the original data, is the result
+		// from the Burrows Wheeler encoding.
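+		// (For illustration: for the input "banana" the sorted rotations are
+		// abanan, anaban, ananab, banana, nabana, nanaba, so the last column is
+		// "nnbaaa" and the row of the original data, firstRow, is 3.)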
+ byte[] lastColumn = m_scratchpad.m_lastColumn;
+ int firstRow = -1;
+
+ for (int i = 0; i < m_length; i++)
+ {
+ int fePtr = ptr[i] - 1;
+ if (fePtr < 0)
+ {
+ fePtr += m_length;
+ firstRow = i;
+ }
+ lastColumn[i] = m_data[fePtr];
+ }
+ return new BurrowsWheelerEncodingResult(lastColumn, firstRow);
+ }
+}
diff --git a/src/main/java/org/at4j/comp/bzip2/CRC.java b/src/main/java/org/at4j/comp/bzip2/CRC.java
new file mode 100644
index 0000000..b7993c7
--- /dev/null
+++ b/src/main/java/org/at4j/comp/bzip2/CRC.java
@@ -0,0 +1,63 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.at4j.comp.bzip2;
+
+/**
+ * Checksum algorithm used by bzip2.
+ * @author Karl Gustafsson
+ * @since 1.1
+ */
+final class CRC
+{
+ // Table from bzip2's crctable.c
+ private static final int[] CRC_TABLE = new int[] { 0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9, 0x130476dc, 0x17c56b6b, 0x1a864db2, 0x1e475005, 0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61, 0x350c9b64, 0x31cd86d3, 0x3c8ea00a,
+ 0x384fbdbd, 0x4c11db70, 0x48d0c6c7, 0x4593e01e, 0x4152fda9, 0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75, 0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011, 0x791d4014, 0x7ddc5da3, 0x709f7b7a, 0x745e66cd, 0x9823b6e0, 0x9ce2ab57,
+ 0x91a18d8e, 0x95609039, 0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5, 0xbe2b5b58, 0xbaea46ef, 0xb7a96036, 0xb3687d81, 0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d, 0xd4326d90, 0xd0f37027, 0xddb056fe, 0xd9714b49, 0xc7361b4c,
+ 0xc3f706fb, 0xceb42022, 0xca753d95, 0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1, 0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d, 0x34867077, 0x30476dc0, 0x3d044b19, 0x39c556ae, 0x278206ab, 0x23431b1c, 0x2e003dc5, 0x2ac12072,
+ 0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16, 0x018aeb13, 0x054bf6a4, 0x0808d07d, 0x0cc9cdca, 0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde, 0x6b93dddb, 0x6f52c06c, 0x6211e6b5, 0x66d0fb02, 0x5e9f46bf, 0x5a5e5b08, 0x571d7dd1,
+ 0x53dc6066, 0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba, 0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e, 0xbfa1b04b, 0xbb60adfc, 0xb6238b25, 0xb2e29692, 0x8aad2b2f, 0x8e6c3698, 0x832f1041, 0x87ee0df6, 0x99a95df3, 0x9d684044,
+ 0x902b669d, 0x94ea7b2a, 0xe0b41de7, 0xe4750050, 0xe9362689, 0xedf73b3e, 0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2, 0xc6bcf05f, 0xc27dede8, 0xcf3ecb31, 0xcbffd686, 0xd5b88683, 0xd1799b34, 0xdc3abded, 0xd8fba05a, 0x690ce0ee,
+ 0x6dcdfd59, 0x608edb80, 0x644fc637, 0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb, 0x4f040d56, 0x4bc510e1, 0x46863638, 0x42472b8f, 0x5c007b8a, 0x58c1663d, 0x558240e4, 0x51435d53, 0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47,
+ 0x36194d42, 0x32d850f5, 0x3f9b762c, 0x3b5a6b9b, 0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff, 0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623, 0xf12f560e, 0xf5ee4bb9, 0xf8ad6d60, 0xfc6c70d7, 0xe22b20d2, 0xe6ea3d65, 0xeba91bbc,
+ 0xef68060b, 0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f, 0xc423cd6a, 0xc0e2d0dd, 0xcda1f604, 0xc960ebb3, 0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7, 0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b, 0x9b3660c6, 0x9ff77d71,
+ 0x92b45ba8, 0x9675461f, 0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3, 0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640, 0x4e8ee645, 0x4a4ffbf2, 0x470cdd2b, 0x43cdc09c, 0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8, 0x68860bfd,
+ 0x6c47164a, 0x61043093, 0x65c52d24, 0x119b4be9, 0x155a565e, 0x18197087, 0x1cd86d30, 0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec, 0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088, 0x2497d08d, 0x2056cd3a, 0x2d15ebe3, 0x29d4f654,
+ 0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0, 0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c, 0xe3a1cbc1, 0xe760d676, 0xea23f0af, 0xeee2ed18, 0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4, 0x89b8fd09, 0x8d79e0be, 0x803ac667,
+ 0x84fbdbd0, 0x9abc8bd5, 0x9e7d9662, 0x933eb0bb, 0x97ffad0c, 0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668, 0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4 };
+
+ private int m_crc = 0xFFFFFFFF;
+
+ /**
+ * @param b An integer value in the interval 0..255.
+ */
+ void update(int b)
+ {
+ if ((b < 0) || (b > 255))
+ {
+ throw new IllegalArgumentException("" + b);
+ }
+
+ m_crc = (m_crc << 8) ^ CRC_TABLE[(m_crc >>> 24) ^ b];
+ }
+
+ int getValue()
+ {
+ return ~m_crc;
+ }
+}
diff --git a/src/main/java/org/at4j/comp/bzip2/CompressedDataBlock.java b/src/main/java/org/at4j/comp/bzip2/CompressedDataBlock.java
new file mode 100644
index 0000000..7bfbcc0
--- /dev/null
+++ b/src/main/java/org/at4j/comp/bzip2/CompressedDataBlock.java
@@ -0,0 +1,51 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.at4j.comp.bzip2;
+
+import java.io.InputStream;
+
+/**
+ * A bzip2 block containing compressed data.
+ * @author Karl Gustafsson
+ * @since 1.1
+ */
+final class CompressedDataBlock implements Block
+{
+ private final InputStream m_stream;
+ private final int m_blockChecksum;
+
+ CompressedDataBlock(InputStream stream, int blockChecksum)
+ {
+ // Null check
+ stream.getClass();
+
+ m_stream = stream;
+ m_blockChecksum = blockChecksum;
+ }
+
+ InputStream getStream()
+ {
+ return m_stream;
+ }
+
+ int getBlockChecksum()
+ {
+ return m_blockChecksum;
+ }
+}
diff --git a/src/main/java/org/at4j/comp/bzip2/EncodedBlockData.java b/src/main/java/org/at4j/comp/bzip2/EncodedBlockData.java
new file mode 100644
index 0000000..baceb2f
--- /dev/null
+++ b/src/main/java/org/at4j/comp/bzip2/EncodedBlockData.java
@@ -0,0 +1,38 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.at4j.comp.bzip2;
+
+/**
+ * This object contains data for an encoded bzip2 block.
+ * @author Karl Gustafsson
+ * @since 1.1
+ */
+final class EncodedBlockData
+{
+ final byte[] m_bytes;
+ final int m_noBits;
+ final int m_bitValue;
+
+ EncodedBlockData(byte[] bytes, int noBits, int bitValue)
+ {
+ m_bytes = bytes;
+ m_noBits = noBits;
+ m_bitValue = bitValue;
+ }
+}
diff --git a/src/main/java/org/at4j/comp/bzip2/EncodedBlockWriter.java b/src/main/java/org/at4j/comp/bzip2/EncodedBlockWriter.java
new file mode 100644
index 0000000..6a34f68
--- /dev/null
+++ b/src/main/java/org/at4j/comp/bzip2/EncodedBlockWriter.java
@@ -0,0 +1,146 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.at4j.comp.bzip2;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.CountDownLatch;
+
+import org.at4j.support.io.BitOutput;
+
+/**
+ * This is used to write encoded blocks in the right order when several encoding
+ * threads are used with the {@link BZip2OutputStream}.
+ * @author Karl Gustafsson
+ * @since 1.1
+ */
+final class EncodedBlockWriter
+{
+ // All variables are protected by this object's intrinsic lock
+ private final BitOutput m_out;
+	private final Map<Integer, EncodedBlockData> m_savedBlocks = new HashMap<Integer, EncodedBlockData>();
+ // This latch is used to signal to the bzip2 output stream when this writer
+ // is finished.
+ private final CountDownLatch m_doneLatch = new CountDownLatch(1);
+ private int m_nextBlockToWrite = 0;
+ private boolean m_hasError;
+
+ EncodedBlockWriter(BitOutput out)
+ {
+ m_out = out;
+ }
+
+ private void writeEncodedBlockData(final EncodedBlockData bd) throws IOException
+ {
+ m_out.writeBytes(bd.m_bytes, 0, bd.m_bytes.length);
+ if (bd.m_noBits > 0)
+ {
+ m_out.writeBits(bd.m_bitValue, bd.m_noBits);
+ }
+ }
+
+ private void writeBlockInternal(final int blockNo, final EncodedBlockData blockData) throws IOException
+ {
+ if (blockData == null)
+ {
+ // We're done
+ m_doneLatch.countDown();
+ }
+ else
+ {
+ writeEncodedBlockData(blockData);
+
+ while (m_savedBlocks.containsKey(++m_nextBlockToWrite))
+ {
+ final EncodedBlockData savedBd = m_savedBlocks.get(m_nextBlockToWrite);
+ if (savedBd != null)
+ {
+ writeEncodedBlockData(savedBd);
+ }
+ else
+ {
+ m_doneLatch.countDown();
+ break;
+ }
+ }
+ }
+ }
+
+ /**
+ * It is not time to write this block just yet. Save it until it is time.
+ * @param blockNo The block number.
+ * @param blockData The block data.
+ */
+ private void saveBlock(final int blockNo, EncodedBlockData blockData)
+ {
+ m_savedBlocks.put(blockNo, blockData);
+ }
+
+ /**
+ * Write the block data to the output if it is the next block to write. If
+ * not, queue it for later writing.
+ * @param blockNo The block number.
+ * @param blockData The block data or {@code null} as an end of stream
+ * marker.
+ * @throws IOException
+ */
+ synchronized void writeBlock(final int blockNo, final EncodedBlockData blockData) throws IOException
+ {
+ if (m_hasError)
+ {
+ return;
+ }
+
+ try
+ {
+ if (blockNo == m_nextBlockToWrite)
+ {
+ writeBlockInternal(blockNo, blockData);
+ }
+ else
+ {
+ saveBlock(blockNo, blockData);
+ }
+ }
+ catch (Error e)
+ {
+ m_hasError = true;
+ m_doneLatch.countDown();
+ throw e;
+ }
+ catch (RuntimeException e)
+ {
+ m_hasError = true;
+ m_doneLatch.countDown();
+ throw e;
+ }
+ catch (IOException e)
+ {
+ m_hasError = true;
+ m_doneLatch.countDown();
+ throw e;
+ }
+ }
+
+ void waitFor() throws InterruptedException
+ {
+ m_doneLatch.await();
+ }
+}
diff --git a/src/main/java/org/at4j/comp/bzip2/EncodingScratchpad.java b/src/main/java/org/at4j/comp/bzip2/EncodingScratchpad.java
new file mode 100644
index 0000000..c882ce7
--- /dev/null
+++ b/src/main/java/org/at4j/comp/bzip2/EncodingScratchpad.java
@@ -0,0 +1,107 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.at4j.comp.bzip2;
+
+/**
+ * This object contains different objects used by a bzip2 encoder thread. It is
+ * used to reduce the number of object and array allocations.
+ * @author Karl Gustafsson
+ * @since 1.1
+ */
+final class EncodingScratchpad
+{
+ private static final int MAX_BLOCK_LENGTH = BZip2OutputStreamSettings.MAX_BLOCK_SIZE * 100 * 1000;
+ private static final int MAX_NO_OF_SEGMENTS = MAX_BLOCK_LENGTH / BlockEncoder.NO_OF_SYMBOLS_PER_SEGMENT;
+
+ // An array that may contain the frequencies of each symbol in the data.
+ final int[] m_frequencies = new int[BlockEncoder.MAX_NO_OF_MTF_SYMBOLS];
+
+ // A move to front alphabet.
+ final byte[] m_mtfAlphabet = new byte[BlockEncoder.MAX_NO_OF_MTF_SYMBOLS];
+
+ // This two dimensional array can contain the frequencies for the different
+ // symbols encoded by the different trees (up to six trees)
+ final int[][] m_frequencies2d = new int[BlockEncoder.MAX_NO_OF_HUFFMAN_TREES][BlockEncoder.MAX_NO_OF_MTF_SYMBOLS];
+
+ // Contains MTF and RL encoded data before the Huffman encoding. The maximum
+ // size is the maximum size of a block + the EOB symbol. The actual size
+ // will probably be significantly shorter than this
+ final int[] m_encodedData = new int[MAX_BLOCK_LENGTH + 1];
+
+ // Frequencies of each two-byte combination used for the radix sort.
+ // Use an overshoot of one position.
+ final int[] m_twoByteFrequencies = new int[65536 + 1];
+
+ // Pointers created by the 3-way radix quicksort
+ final int[] m_ptrs = new int[MAX_BLOCK_LENGTH];
+
+ // A cache for sort results
+ final int[] m_sortCache = new int[MAX_BLOCK_LENGTH + ThreeWayRadixQuicksort.DATA_OVERSHOOT];
+
+ // Array for temporary data. This will be grown incrementally as the need
+ // arises.
+ int[] m_tempArea = new int[1024];
+
+ // Stack for block sorting
+ final ThreeWayRadixQuicksort.QuickSortRangeInfo[] m_sortStack = new ThreeWayRadixQuicksort.QuickSortRangeInfo[ThreeWayRadixQuicksort.SORT_STACK_SIZE];
+
+	// The results when all segments of a block are encoded with all available
+ // Huffman trees
+ final int[][] m_encodingResults = new int[MAX_NO_OF_SEGMENTS][BlockEncoder.MAX_NO_OF_HUFFMAN_TREES];
+
+ final int[] m_categoriesPerSegment = new int[MAX_NO_OF_SEGMENTS];
+
+ // The last column after Burrows Wheeler encoding
+ final byte[] m_lastColumn = new byte[MAX_BLOCK_LENGTH];
+
+ // The bucket sorting order
+ final int[] m_sortOrder = new int[256];
+ // Used when scanning pointers
+ final int[] m_copyStart = new int[256];
+ final int[] m_copyEnd = new int[256];
+
+ // Mapping between a symbol and its index number in the array of symbols
+ // used by the run length encoder.
+ final byte[] m_sequenceMap = new byte[256];
+
+ // Heap used when calculating Huffman tree code lengths
+ final int[] m_htHeap = new int[BlockEncoder.MAX_NO_OF_MTF_SYMBOLS + 2];
+ final int[] m_htWeight = new int[BlockEncoder.MAX_NO_OF_MTF_SYMBOLS * 2];
+ final int[] m_htParent = new int[BlockEncoder.MAX_NO_OF_MTF_SYMBOLS * 2];
+
+ // Flags for all sorted large buckets
+ final boolean[] m_sortedLargeBuckets = new boolean[256];
+ // Flags for all sorted small buckets
+ final boolean[] m_sortedSmallBuckets = new boolean[256 * 256];
+
+ /**
+ * Get a temporary integer array with a length of at least {@code len}
+ * integers.
+ */
+ int[] getTemp(final int len)
+ {
+ // Is the current temp area large enough?
+ if (m_tempArea.length < len)
+ {
+ // No. Reallocate it
+ m_tempArea = new int[len + 100];
+ }
+ return m_tempArea;
+ }
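+
+ // Illustrative usage (an assumption, not taken from the callers): an encoder
+ // needing a work buffer of blockLength ints could call
+ //
+ // int[] tmp = scratchpad.getTemp(blockLength);
+ //
+ // and must treat the result as scratch memory only: it may be longer than
+ // blockLength and it is not cleared between calls.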
+}
diff --git a/src/main/java/org/at4j/comp/bzip2/EncodingThread.java b/src/main/java/org/at4j/comp/bzip2/EncodingThread.java
new file mode 100644
index 0000000..7203639
--- /dev/null
+++ b/src/main/java/org/at4j/comp/bzip2/EncodingThread.java
@@ -0,0 +1,49 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.at4j.comp.bzip2;
+
+/**
+ * This is the kind of thread used for encoding bzip2 blocks.
+ * @author Karl Gustafsson
+ * @since 1.1
+ */
+final class EncodingThread extends Thread
+{
+ private final EncodingScratchpad m_scratchpad = new EncodingScratchpad();
+ private final ErrorState m_errorState;
+
+ EncodingThread(Runnable r, ErrorState es)
+ {
+ super(r);
+ m_errorState = es;
+ }
+
+ /**
+ * Get this thread's scratchpad.
+ */
+ EncodingScratchpad getScratchpad()
+ {
+ return m_scratchpad;
+ }
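+
+ // Illustrative sketch (assumption): a block-encoding task running on one of
+ // these threads can reach the per-thread buffers without extra allocation via
+ //
+ // EncodingScratchpad sp = ((EncodingThread) Thread.currentThread()).getScratchpad();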
+
+ ErrorState getErrorState()
+ {
+ return m_errorState;
+ }
+}
diff --git a/src/main/java/org/at4j/comp/bzip2/EncodingThreadFactory.java b/src/main/java/org/at4j/comp/bzip2/EncodingThreadFactory.java
new file mode 100644
index 0000000..bea7cf5
--- /dev/null
+++ b/src/main/java/org/at4j/comp/bzip2/EncodingThreadFactory.java
@@ -0,0 +1,41 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.at4j.comp.bzip2;
+
+import java.util.concurrent.ThreadFactory;
+
+/**
+ * This is a factory for creating {@link EncodingThread} objects.
+ * @author Karl Gustafsson
+ * @since 1.1
+ */
+final class EncodingThreadFactory implements ThreadFactory
+{
+ private final ErrorState m_errorState;
+
+ EncodingThreadFactory(ErrorState es)
+ {
+ m_errorState = es;
+ }
+
+ public Thread newThread(Runnable r)
+ {
+ return new EncodingThread(r, m_errorState);
+ }
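+
+ // Rough usage sketch (assumption; the variable names are made up): the factory
+ // is meant to be handed to an executor so every worker gets its own scratchpad:
+ //
+ // ExecutorService executor =
+ // Executors.newFixedThreadPool(noThreads, new EncodingThreadFactory(errorState));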
+}
diff --git a/src/main/java/org/at4j/comp/bzip2/EosBlock.java b/src/main/java/org/at4j/comp/bzip2/EosBlock.java
new file mode 100644
index 0000000..9871d00
--- /dev/null
+++ b/src/main/java/org/at4j/comp/bzip2/EosBlock.java
@@ -0,0 +1,39 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.at4j.comp.bzip2;
+
+/**
+ * A bzip2 block containing end of stream information.
+ * @author Karl Gustafsson
+ * @since 1.1
+ */
+final class EosBlock implements Block
+{
+ private final long m_readCrc;
+
+ EosBlock(long readCrc)
+ {
+ m_readCrc = readCrc;
+ }
+
+ long getReadCrc()
+ {
+ return m_readCrc;
+ }
+}
diff --git a/src/main/java/org/at4j/comp/bzip2/ErrorState.java b/src/main/java/org/at4j/comp/bzip2/ErrorState.java
new file mode 100644
index 0000000..3cb97c6
--- /dev/null
+++ b/src/main/java/org/at4j/comp/bzip2/ErrorState.java
@@ -0,0 +1,52 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.at4j.comp.bzip2;
+
+import java.io.IOException;
+
+/**
+ * This is used to keep track of encoding errors.
+ *
+ * Every error is registered with an owner token that is a unique identifier for
+ * the object that is affected by the error. The owner token object must have a
+ * good {@link Object#hashCode()} method.
+ * @author Karl Gustafsson
+ * @since 1.1
+ */
+interface ErrorState
+{
+ /**
+ * Register an {@link Exception} or an {@link Error}.
+ * @param t The exception or error.
+ * @param ownerToken A unique identifier for the error owner, i.e. the
+ * object that the encoding thread is performing work for.
+ */
+ void registerError(Throwable t, Object ownerToken);
+
+ /**
+ * Check for errors.
+ * @param ownerToken The owner.
+ * @throws Error If there is a registered {@link Error} for this owner.
+ * @throws RuntimeException If there is a registered
+ * {@link RuntimeException} for this owner.
+ * @throws IOException If there is a registered {@link IOException} for this
+ * owner.
+ */
+ void checkAndClearErrors(Object ownerToken) throws Error, RuntimeException, IOException;
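+
+ // Rough usage sketch (an assumption based on the javadoc above; the helper
+ // names are made up):
+ //
+ // try {
+ // encodeBlock();
+ // } catch (Throwable t) {
+ // errorState.registerError(t, ownerToken);
+ // }
+ //
+ // while the owner later calls errorState.checkAndClearErrors(ownerToken) on
+ // its own thread to rethrow anything the workers recorded for it.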
+}
diff --git a/src/main/java/org/at4j/comp/bzip2/HighValueBranchHuffmanTree.java b/src/main/java/org/at4j/comp/bzip2/HighValueBranchHuffmanTree.java
new file mode 100644
index 0000000..fb34bd3
--- /dev/null
+++ b/src/main/java/org/at4j/comp/bzip2/HighValueBranchHuffmanTree.java
@@ -0,0 +1,438 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.at4j.comp.bzip2;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.at4j.support.io.BitInput;
+import org.at4j.support.io.BitOutput;
+
+/**
+ * This object represents the type of Huffman tree that is used by bzip2. The
+ * "high value branch" means that leaf nodes have the smallest possible values
+ * and non-leaf nodes have the highest possible values at each tree depth.
+ * @author Karl Gustafsson
+ * @since 1.1
+ */
+final class HighValueBranchHuffmanTree
+{
+ private static final int MAX_NO_OF_SYMBOLS = 258;
+
+ // The shortest code length for symbols in this tree.
+ private final int m_minLength;
+ // The longest code length for symbols in this tree.
+ private final int m_maxLength;
+ // m_maxLength - m_minLength + 1;
+ // Declared package private for the unit tests.
+ final int m_numberOfLengths;
+
+ // The value limit at each code length, i.e. the maximum value for leaf
+ // nodes at that code length.
+ // Declared package private for the unit tests.
+ final int[] m_limitsPerLength;
+ // The lowest value for a symbol at each length. The value for length
+ // m_minLength is at index 0 in the array.
+ // Declared package private for the unit tests.
+ final int[] m_baseValuesPerLength;
+ // The offset in the m_symbolSequenceNos array for the first symbol for each
+ // Huffman code length. The array has the length m_maxLength - m_minLength +
+ // 1. The value for m_minLength is at index 0 (and is 0).
+ // Declared package private for the unit tests.
+ final int[] m_symbolOffsetPerLength;
+ // The index of the symbol table for Huffman code no n.
+ // Declared package private for the unit tests.
+ final int[] m_symbolSequenceNos;
+ // This table contains the Huffman codes and the code bit lengths for each
+ // symbol. It is created when using the constructor that calculates the
+ // Huffman trees to speed up encoding.
+ final int[][] m_huffmanCodesAndLengthsPerSymbol;
+
+ /**
+ * Get the Huffman code and its bit length for a symbol.
+ * @param symbol The symbol.
+ * @param huffmanIndex The symbol's index in the list of sorted symbols.
+ * @param codeAndLength An int array of length 2 used to store the result
+ * in.
+ */
+ private int[] getCodeAndLengthForSymbol(final int symbol, final int huffmanIndex, final int[] codeAndLength)
+ {
+ // Calculate the length of the symbol's Huffman code
+ int deltaLen;
+ for (deltaLen = 0; deltaLen < m_numberOfLengths - 1; deltaLen++)
+ {
+ if (huffmanIndex < m_symbolOffsetPerLength[deltaLen + 1])
+ {
+ break;
+ }
+ }
+
+ codeAndLength[0] = m_baseValuesPerLength[deltaLen] + (huffmanIndex - m_symbolOffsetPerLength[deltaLen]);
+ codeAndLength[1] = m_minLength + deltaLen;
+ return codeAndLength;
+ }
+
+ /**
+ * Create a canonical Huffman tree for the supplied symbols.
+ *
+ * Symbol lengths for a canonical Huffman tree can be created by the
+ * {@link #createCodeLengths(int[], int, int, EncodingScratchpad)} method.
+ * @param symbolLengths The length of the Huffman code for each symbol.
+ * @param minLength The shortest Huffman code length in the tree.
+ * @param maxLength The longest Huffman code length in the tree.
+ * @param forEncoding Should the tree be used for encoding? If so, a lookup
+ * table that contains the Huffman code for each symbol is created to speed
+ * up the encoding.
+ * @throws IllegalArgumentException If the lengths are invalid.
+ */
+ HighValueBranchHuffmanTree(final int[] symbolLengths, final int minLength, final int maxLength, final boolean forEncoding) throws IllegalArgumentException
+ {
+ if ((minLength < 0) || (maxLength < minLength))
+ {
+ throw new IllegalArgumentException("Illegal min or max length, min: " + minLength + ", max: " + maxLength);
+ }
+
+ final int numberOfSymbols = symbolLengths.length;
+ final int numberOfLengths = maxLength - minLength + 1;
+ // Create an array of symbol sequence numbers sorted on their symbol
+ // lengths
+ m_symbolSequenceNos = new int[numberOfSymbols];
+ // The number of symbols having each code length
+ final int[] numl = new int[numberOfLengths];
+ int index = 0;
+ for (int i = minLength; i <= maxLength; i++)
+ {
+ numl[i - minLength] = 0;
+ for (int j = 0; j < numberOfSymbols; j++)
+ {
+ if (symbolLengths[j] == i)
+ {
+ m_symbolSequenceNos[index++] = j;
+ numl[i - minLength]++;
+ }
+ }
+ }
+
+ m_symbolOffsetPerLength = new int[numberOfLengths];
+ m_symbolOffsetPerLength[0] = 0;
+ for (int i = 0; i < numberOfLengths - 1; i++)
+ {
+ m_symbolOffsetPerLength[i + 1] = m_symbolOffsetPerLength[i] + numl[i];
+ }
+
+ // The value limit at each length
+ m_limitsPerLength = new int[numberOfLengths - 1];
+ m_baseValuesPerLength = new int[numberOfLengths];
+ int prevLimit = 0;
+ for (int i = minLength; i <= maxLength; i++)
+ {
+ index = i - minLength;
+ // The base value for this length is the value of the smallest
+ // allowed symbol for this length. The smallest allowed symbol is
+ // the limit for the previous length with a zero at the end.
+ m_baseValuesPerLength[index] = prevLimit << 1;
+
+ if (i < maxLength)
+ {
+ // The limit for this length is the base value for this length
+ // plus the number of symbols for this length.
+ prevLimit = m_baseValuesPerLength[index] + numl[index];
+ m_limitsPerLength[index] = prevLimit - 1;
+ }
+ }
+
+ m_minLength = minLength;
+ m_maxLength = maxLength;
+ m_numberOfLengths = (byte) (maxLength - minLength + 1);
+ if (forEncoding)
+ {
+ // Create an inverse mapping into the list of sorted symbols
+ final int[] huffmanIndexPerSymbol = new int[symbolLengths.length];
+ Arrays.fill(huffmanIndexPerSymbol, -1);
+ for (int i = 0; i < m_symbolSequenceNos.length; i++)
+ {
+ huffmanIndexPerSymbol[m_symbolSequenceNos[i]] = i;
+ }
+
+ // Create a table containing the Huffman code and its bit length for
+ // each symbol. This is used to speed up writes.
+ m_huffmanCodesAndLengthsPerSymbol = new int[symbolLengths.length][2];
+ int[] codeAndLength = new int[2];
+ for (int i = 0; i < symbolLengths.length; i++)
+ {
+ codeAndLength = getCodeAndLengthForSymbol(i, huffmanIndexPerSymbol[i], codeAndLength);
+ m_huffmanCodesAndLengthsPerSymbol[i][0] = codeAndLength[0];
+ m_huffmanCodesAndLengthsPerSymbol[i][1] = codeAndLength[1];
+ }
+ }
+ else
+ {
+ // Don't create these variables. They are only used when writing data
+ // and it is assumed that this constructor will only be used to create
+ // trees for reading data.
+ m_huffmanCodesAndLengthsPerSymbol = null;
+ }
+ }
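+
+ // Worked example (illustrative, not taken from the sources): six symbols with
+ // code lengths {2, 2, 3, 3, 3, 3} give minLength = 2, maxLength = 3,
+ // numl = {2, 4}, base values {0, 4} and limit {1}, so the canonical codes are
+ // 00, 01, 100, 101, 110, 111 - the leaves take the lowest values and the
+ // internal branch nodes (10 and 11 at depth 2) the highest, as the class
+ // comment describes.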
+
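+ // Standard binary-heap sift-up/sift-down helpers for the 1-based heap array
+ // used by createCodeLengths below (index 0 is a sentinel).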
+ private static void upHeap(final int[] heap, final int[] weight, int nHeap)
+ {
+ int tmp = heap[nHeap];
+ while (weight[tmp] < weight[heap[nHeap >> 1]])
+ {
+ heap[nHeap] = heap[nHeap >>> 1];
+ nHeap >>>= 1;
+ }
+ heap[nHeap] = tmp;
+ }
+
+ private static void downHeap(final int[] heap, final int[] weight, final int nHeap, int n)
+ {
+ int tmp = heap[n];
+ while (true)
+ {
+ int yy = n << 1;
+ if (yy > nHeap)
+ {
+ break;
+ }
+ if (yy < nHeap && weight[heap[yy + 1]] < weight[heap[yy]])
+ {
+ yy++;
+ }
+ if (weight[tmp] < weight[heap[yy]])
+ {
+ break;
+ }
+ heap[n] = heap[yy];
+ n = yy;
+ }
+ heap[n] = tmp;
+ }
+
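+ // Combine two node weights: the high 24 bits accumulate the (frequency << 8)
+ // weights while the low 8 bits record one plus the depth of the deeper
+ // subtree, which appears to be how the bzip2 1.0.5 algorithm keeps otherwise
+ // equal merges from stacking up the tree depth.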
+ private static int addWeights(final int w1, final int w2)
+ {
+ final int d1 = w1 & 0xFF;
+ final int d2 = w2 & 0xFF;
+ final int ww1 = w1 & 0xFFFFFF00;
+ final int ww2 = w2 & 0xFFFFFF00;
+ return (ww1 + ww2) | (1 + (d1 > d2 ? d1 : d2));
+ }
+
+ int getMinLength()
+ {
+ return m_minLength;
+ }
+
+ int getMaxLength()
+ {
+ return m_maxLength;
+ }
+
+ /**
+ * Get a sorted array with symbol sequence numbers and their Huffman code
+ * lengths. The returned array is sorted with the most frequently occurring
+ * symbol first (i.e. the symbol with the shortest Huffman code).
+ *
+ * This method is used for testing.
+ * @return Array a[n][0] = symbol, a[n][1] = Huffman code length
+ */
+ int[][] getSortedSymbolSequenceNosAndCodeLengths()
+ {
+ int[][] res = new int[m_symbolSequenceNos.length][2];
+ int length = m_minLength;
+ for (int i = 0; i < m_symbolSequenceNos.length; i++)
+ {
+ while ((length < m_maxLength) && (i >= m_symbolOffsetPerLength[length - m_minLength + 1]))
+ {
+ length++;
+ }
+ res[i][0] = m_symbolSequenceNos[i];
+ res[i][1] = length;
+ }
+ return res;
+ }
+
+ /**
+ * Read the next symbol.
+ * @param in The input to read the symbol from.
+ * @return The next symbol.
+ * @throws IOException On I/O errors.
+ */
+ int readNext(final BitInput in) throws IOException
+ {
+ int code = in.readBits(m_minLength);
+ // m_limitsPerLength.length == 0 means that all Huffman codes have the
+ // same length.
+ if (m_limitsPerLength.length == 0 || code <= m_limitsPerLength[0])
+ {
+ return m_symbolSequenceNos[code];
+ }
+ else
+ {
+ int codeLength = m_minLength;
+ int index = 1;
+ while (true)
+ {
+ code = (code << 1) | (in.readBit() ? 1 : 0);
+ codeLength++;
+ if ((codeLength == m_maxLength) || (code <= m_limitsPerLength[index]))
+ {
+ return m_symbolSequenceNos[m_symbolOffsetPerLength[index] + (code - m_baseValuesPerLength[index])];
+ }
+ index++;
+ }
+ }
+ }
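+
+ // Illustrative decode walk-through (matches the worked example above): with
+ // codes 00, 01, 100, 101, 110, 111 the first two bits "10" give code 2, which
+ // exceeds the length-2 limit of 1, so one more bit is read; "101" = 5 then
+ // maps to m_symbolSequenceNos[m_symbolOffsetPerLength[1] + (5 - 4)], i.e. the
+ // symbol at sorted index 3.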
+
+ /**
+ * Write a symbol.
+ * @param out The output to write to.
+ * @param symbol The symbol to write.
+ * @throws IOException On I/O errors.
+ */
+ void write(final BitOutput out, final int symbol) throws IOException
+ {
+ out.writeBitsLittleEndian(m_huffmanCodesAndLengthsPerSymbol[symbol][0], m_huffmanCodesAndLengthsPerSymbol[symbol][1]);
+ }
+
+ /**
+ * Get the number of bits used for encoding the symbol.
+ */
+ int getBitLength(int symbol)
+ {
+ return m_huffmanCodesAndLengthsPerSymbol[symbol][1];
+ }
+
+ /**
+ * Calculate the Huffman code lengths for the optimal, depth-limited Huffman
+ * tree for the supplied symbol frequencies.
+ *
+ * This method uses the (slightly magic) algorithm from bzip2 1.0.5.
+ * @param frequencies The frequencies for each symbol in the data to be
+ * encoded.
+ * @param noSymbols The number of different symbols in the data to encode.
+ * This should be the maximum symbol value (the EOB symbol's value) + 1.
+ * @param maxLength The maximum code length which also will be the depth of
+ * the Huffman tree. If this is too small, this method will get stuck in an
+ * infinite loop.
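+ * @param scratchpad The calling thread's scratchpad supplying the reusable
+ * heap, weight and parent work arrays.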
+ * @return The Huffman code lengths for each symbol.
+ */
+ static int[] createCodeLengths(final int[] frequencies, final int noSymbols, final int maxLength, final EncodingScratchpad scratchpad)
+ {
+ /*
+ * Nodes and heap entries run from 1. Entry 0 for both the heap and
+ * nodes is a sentinel.
+ */
+
+ final int[] heap = scratchpad.m_htHeap;
+ final int[] weight = scratchpad.m_htWeight;
+ final int[] parent = scratchpad.m_htParent;
+
+ final int[] res = new int[noSymbols];
+
+ int actualMaxLength = -1;
+ int actualMinLength = Integer.MAX_VALUE;
+
+ for (int i = 0; i < noSymbols; i++)
+ {
+ weight[i + 1] = (frequencies[i] == 0 ? 1 : frequencies[i]) << 8;
+ }
+
+ while (true)
+ {
+ int noNodes = noSymbols;
+ int nHeap = 0;
+
+ heap[0] = 0;
+ weight[0] = 0;
+ parent[0] = -2;
+
+ for (int i = 1; i <= noSymbols; i++)
+ {
+ parent[i] = -1;
+ nHeap++;
+ heap[nHeap] = i;
+ upHeap(heap, weight, nHeap);
+ }
+
+ assert nHeap < MAX_NO_OF_SYMBOLS + 2;
+
+ while (nHeap > 1)
+ {
+ int n1 = heap[1];
+ heap[1] = heap[nHeap];
+ nHeap--;
+ downHeap(heap, weight, nHeap, 1);
+ int n2 = heap[1];
+ heap[1] = heap[nHeap];
+ nHeap--;
+ downHeap(heap, weight, nHeap, 1);
+ noNodes++;
+ parent[n1] = parent[n2] = noNodes;
+ weight[noNodes] = addWeights(weight[n1], weight[n2]);
+ parent[noNodes] = -1;
+ nHeap++;
+ heap[nHeap] = noNodes;
+ upHeap(heap, weight, nHeap);
+ }
+
+ assert noNodes < MAX_NO_OF_SYMBOLS * 2;
+
+ boolean tooLong = false;
+ INNER: for (int i = 1; i <= noSymbols; i++)
+ {
+ int j = 0;
+ int k = i;
+ while (parent[k] >= 0)
+ {
+ k = parent[k];
+ j++;
+ }
+ res[i - 1] = j;
+ if (j > maxLength)
+ {
+ tooLong = true;
+ break INNER;
+ }
+
+ if (j > actualMaxLength)
+ {
+ actualMaxLength = j;
+ }
+ if (j < actualMinLength)
+ {
+ actualMinLength = j;
+ }
+ }
+
+ if (!tooLong)
+ {
+ break;
+ }
+
+ for (int i = 1; i <= noSymbols; i++)
+ {
+ int j = weight[i] >> 8;
+ j = 1 + (j / 2);
+ weight[i] = j << 8;
+ }
+ }
+ return res;
+ }
+}
diff --git a/src/main/java/org/at4j/comp/bzip2/MultipleObserverErrorState.java b/src/main/java/org/at4j/comp/bzip2/MultipleObserverErrorState.java
new file mode 100644
index 0000000..b7ac53d
--- /dev/null
+++ b/src/main/java/org/at4j/comp/bzip2/MultipleObserverErrorState.java
@@ -0,0 +1,67 @@
+/* AT4J -- Archive file tools for Java -- http://www.at4j.org
+ * Copyright (C) 2009 Karl Gustafsson
+ *
+ * This file is a part of AT4J
+ *
+ * AT4J is free software: you can redistribute it and/or modify it under the
+ * terms of the GNU Lesser General Public License as published by the Free
+ * Software Foundation, either version 3 of the License, or (at your option)
+ * any later version.
+ *
+ * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.at4j.comp.bzip2;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * This {@link ErrorState} may have several observers, which means that we have
+ * to keep track of the owner of each registered error.
+ *
+ * This is used when sharing the same
+ * {@link java.util.concurrent.ExecutorService} between several
+ * {@link BZip2OutputStream}s.
+ * @author Karl Gustafsson
+ * @since 1.1
+ */
+final class MultipleObserverErrorState implements ErrorState
+{
+ private Map