diff --git a/build.gradle b/build.gradle index 67c0f6d..bb2881b 100644 --- a/build.gradle +++ b/build.gradle @@ -41,6 +41,7 @@ dependencies { include "org.tukaani:xz:1.8" modCompile 'com.github.shevek:parallelgzip:master-SNAPSHOT' + include 'com.github.shevek:parallelgzip:master-SNAPSHOT' } processResources { diff --git a/src/Copyright_Notice.txt b/src/Copyright_Notice.txt index b755408..4740a3f 100644 --- a/src/Copyright_Notice.txt +++ b/src/Copyright_Notice.txt @@ -1,7 +1,9 @@ This project uses third party libraries as its dependencies and includes them in jar. Those are : - Apache Commons Compress version 1.20 licensed under Apache License Version 2.0 which can be found at http://www.apache.org/licenses/ + Apache Commons Compress licensed under Apache License Version 2.0 which can be found at http://www.apache.org/licenses/ Cotton config, Cotton logging, and Jankson-Fabric all by Cotton team licensed under MIT license which can be found at https://github.com/CottonMC/Cotton XZ for Java by Tukaani released as public domain. 
https://tukaani.org/xz/java.html + parallelgzip by shevek under Apache 2.0 http://www.apache.org/licenses/ + Parallel BZip2 compression by Karl Gustafsson at http://at4j.sourceforge.net/ under LGPL v3 Some code was partially or fully inspired by: Parallel zip compression: https://stackoverflow.com/questions/54624695/how-to-implement-parallel-zip-creation-with-scatterzipoutputstream-with-zip64-su diff --git a/src/main/java/net/szum123321/textile_backup/core/MakeBackupThread.java b/src/main/java/net/szum123321/textile_backup/core/MakeBackupThread.java index 4cb4677..d5700ea 100644 --- a/src/main/java/net/szum123321/textile_backup/core/MakeBackupThread.java +++ b/src/main/java/net/szum123321/textile_backup/core/MakeBackupThread.java @@ -23,11 +23,11 @@ import net.minecraft.server.command.ServerCommandSource; import net.minecraft.world.dimension.DimensionType; import net.szum123321.textile_backup.TextileBackup; import net.szum123321.textile_backup.core.compressors.GenericTarCompressor; +import net.szum123321.textile_backup.core.compressors.ParallelBZip2Compressor; import net.szum123321.textile_backup.core.compressors.ParallelZipCompressor; import org.anarres.parallelgzip.ParallelGZIPOutputStream; -import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; -import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream; import org.apache.commons.compress.compressors.xz.XZCompressorOutputStream; +import org.at4j.comp.bzip2.BZip2OutputStream; import java.io.File; import java.io.IOException; @@ -72,7 +72,7 @@ public class MakeBackupThread implements Runnable { break; case BZIP2: - GenericTarCompressor.createArchive(world, outFile, BZip2CompressorOutputStream.class, ctx); + ParallelBZip2Compressor.createArchive(world, outFile, ctx); break; case GZIP: diff --git a/src/main/java/net/szum123321/textile_backup/core/compressors/GenericTarCompressor.java b/src/main/java/net/szum123321/textile_backup/core/compressors/GenericTarCompressor.java 
index 5c07d06..85dffb7 100644 --- a/src/main/java/net/szum123321/textile_backup/core/compressors/GenericTarCompressor.java +++ b/src/main/java/net/szum123321/textile_backup/core/compressors/GenericTarCompressor.java @@ -5,7 +5,6 @@ import net.szum123321.textile_backup.TextileBackup; import net.szum123321.textile_backup.core.Utilities; import org.apache.commons.compress.archivers.ArchiveEntry; import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream; -import org.apache.commons.compress.compressors.CompressorOutputStream; import org.apache.commons.compress.utils.IOUtils; @@ -29,15 +28,15 @@ public class GenericTarCompressor { File input = in.getCanonicalFile(); - Files.walk(input.toPath()).filter( - path -> !path.equals(input.toPath()) && - path.toFile().isFile() && - !Utilities.isBlacklisted(input.toPath().relativize(path)) + Files.walk(input.toPath() + ).filter(path -> !path.equals(input.toPath()) && + path.toFile().isFile() && + !Utilities.isBlacklisted(input.toPath().relativize(path)) ).forEach(path -> { File file = path.toAbsolutePath().toFile(); try (FileInputStream fin = new FileInputStream(file); - BufferedInputStream bfin = new BufferedInputStream(fin)){ + BufferedInputStream bfin = new BufferedInputStream(fin)) { ArchiveEntry entry = arc.createArchiveEntry(file, input.toPath().relativize(path).toString()); arc.putArchiveEntry(entry); diff --git a/src/main/java/net/szum123321/textile_backup/core/compressors/ParallelBZip2Compressor.java b/src/main/java/net/szum123321/textile_backup/core/compressors/ParallelBZip2Compressor.java new file mode 100644 index 0000000..bc0c3c2 --- /dev/null +++ b/src/main/java/net/szum123321/textile_backup/core/compressors/ParallelBZip2Compressor.java @@ -0,0 +1,62 @@ +package net.szum123321.textile_backup.core.compressors; + +import net.minecraft.server.command.ServerCommandSource; +import net.szum123321.textile_backup.TextileBackup; +import net.szum123321.textile_backup.core.Utilities; +import 
org.apache.commons.compress.archivers.ArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream; +import org.apache.commons.compress.utils.IOUtils; +import org.at4j.comp.bzip2.BZip2OutputStream; +import org.at4j.comp.bzip2.BZip2OutputStreamSettings; + +import java.io.*; +import java.nio.file.Files; + +public class ParallelBZip2Compressor { + public static void createArchive(File in, File out, ServerCommandSource ctx) { + Utilities.log("Starting compression...", ctx); + + BZip2OutputStreamSettings settings = new BZip2OutputStreamSettings().setNumberOfEncoderThreads(Runtime.getRuntime().availableProcessors()); + + long start = System.nanoTime(); + + try (FileOutputStream fileOutputStream = new FileOutputStream(out); + BufferedOutputStream bufferedOutputStream = new BufferedOutputStream(fileOutputStream); + BZip2OutputStream bZip2OutputStream = new BZip2OutputStream(bufferedOutputStream, settings); + TarArchiveOutputStream arc = new TarArchiveOutputStream(bZip2OutputStream)) { + + arc.setLongFileMode(TarArchiveOutputStream.LONGFILE_POSIX); + arc.setBigNumberMode(TarArchiveOutputStream.BIGNUMBER_POSIX); + + File input = in.getCanonicalFile(); + + Files.walk(input.toPath() + ).filter(path -> !path.equals(input.toPath()) && + path.toFile().isFile() && + !Utilities.isBlacklisted(input.toPath().relativize(path)) + ).forEach(path -> { + File file = path.toAbsolutePath().toFile(); + + try (FileInputStream fin = new FileInputStream(file); + BufferedInputStream bfin = new BufferedInputStream(fin)) { + ArchiveEntry entry = arc.createArchiveEntry(file, input.toPath().relativize(path).toString()); + + arc.putArchiveEntry(entry); + IOUtils.copy(bfin, arc); + + arc.closeArchiveEntry(); + } catch (IOException e) { + TextileBackup.logger.error(e.getMessage()); + } + }); + + arc.finish(); + } catch (IOException e) { + e.printStackTrace(); + } + + long end = System.nanoTime(); + + Utilities.log("Compression took: " + ((end - start) / 1000000000.0) + "s", 
ctx); + } +} \ No newline at end of file diff --git a/src/main/java/net/szum123321/textile_backup/core/compressors/ParallelZipCompressor.java b/src/main/java/net/szum123321/textile_backup/core/compressors/ParallelZipCompressor.java index 867c175..11a8ae5 100644 --- a/src/main/java/net/szum123321/textile_backup/core/compressors/ParallelZipCompressor.java +++ b/src/main/java/net/szum123321/textile_backup/core/compressors/ParallelZipCompressor.java @@ -24,7 +24,7 @@ public class ParallelZipCompressor { public static void createArchive(File in, File out, ServerCommandSource ctx) { Utilities.log("Starting compression...", ctx); - long start = System.nanoTime();; + long start = System.nanoTime(); try (FileOutputStream fileOutputStream = new FileOutputStream(out); BufferedOutputStream bufferedOutputStream = new BufferedOutputStream(fileOutputStream); @@ -39,15 +39,15 @@ public class ParallelZipCompressor { File input = in.getCanonicalFile(); - Files.walk(input.toPath()).filter( - path -> !path.equals(input.toPath()) && - path.toFile().isFile() && - !Utilities.isBlacklisted(input.toPath().relativize(path)) + Files.walk(input.toPath() + ).filter(path -> !path.equals(input.toPath()) && + path.toFile().isFile() && + !Utilities.isBlacklisted(input.toPath().relativize(path)) ).forEach(p -> { - ZipArchiveEntry entry = new ZipArchiveEntry(input.toPath().relativize(p).toString()); - entry.setMethod(ZipEntry.DEFLATED); - FileInputStreamSupplier supplier = new FileInputStreamSupplier(p); - scatterZipCreator.addArchiveEntry(entry, supplier); + ZipArchiveEntry entry = new ZipArchiveEntry(input.toPath().relativize(p).toString()); + entry.setMethod(ZipEntry.DEFLATED); + FileInputStreamSupplier supplier = new FileInputStreamSupplier(p); + scatterZipCreator.addArchiveEntry(entry, supplier); }); scatterZipCreator.writeTo(arc); diff --git a/src/main/java/org/at4j/comp/CompressionLevel.java b/src/main/java/org/at4j/comp/CompressionLevel.java new file mode 100644 index 0000000..2f1fed0 --- 
/dev/null +++ b/src/main/java/org/at4j/comp/CompressionLevel.java @@ -0,0 +1,43 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package org.at4j.comp; + +/** + * This is an enumeration over different generic compression levels supported by + * some of At4J's compression algorithms. 
+ * @author Karl Gustafsson + * @since 1.0.2 + */ +public enum CompressionLevel +{ + BEST("best"), DEFAULT("default"), FASTEST("fastest"); + + private final String m_tag; + + private CompressionLevel(String tag) + { + m_tag = tag; + } + + @Override + public String toString() + { + return m_tag + " compression"; + } +} diff --git a/src/main/java/org/at4j/comp/bzip2/BZip2EncoderExecutorService.java b/src/main/java/org/at4j/comp/bzip2/BZip2EncoderExecutorService.java new file mode 100644 index 0000000..8345d82 --- /dev/null +++ b/src/main/java/org/at4j/comp/bzip2/BZip2EncoderExecutorService.java @@ -0,0 +1,50 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package org.at4j.comp.bzip2; + +/** + * This interface identifies an executor service that is used to spread the + * encoding of bzip2 blocks over several threads. It can be used to speed up + * bzip2 encoding. + *

+ * The executor service spreads the work over all threads available to it. If a + * {@link BZip2OutputStream} submits more work when all threads are busy, the + * call blocks until the next thread becomes available. + *

+ * When the client is done using the executor, it must call {@link #shutdown()} + * to release all of its resources. + *

+ * An executor service instance can be had from the + * {@link BZip2OutputStream#createExecutorService(int)} method. + *

+ * This interface does not expose any methods except the {@link #shutdown()} + * method and there is no way of making a custom executor service + * implementation. + * @author Karl Gustafsson + * @since 1.1 + */ +public interface BZip2EncoderExecutorService +{ + /** + * This method should be called when the executor service is no longer + * needed. It terminates all threads and releases all other resources + * associated with the executor. + */ + void shutdown(); +} diff --git a/src/main/java/org/at4j/comp/bzip2/BZip2EncoderExecutorServiceImpl.java b/src/main/java/org/at4j/comp/bzip2/BZip2EncoderExecutorServiceImpl.java new file mode 100644 index 0000000..e41dfcf --- /dev/null +++ b/src/main/java/org/at4j/comp/bzip2/BZip2EncoderExecutorServiceImpl.java @@ -0,0 +1,86 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package org.at4j.comp.bzip2; + +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.RejectedExecutionHandler; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; + +/** + * This is the only implementation of {@link BZip2EncoderExecutorService}. All + * objects that are using that interface assume that it is implemented by this + * class. 
+ * @author Karl Gustafsson + * @since 1.1 + */ +final class BZip2EncoderExecutorServiceImpl implements BZip2EncoderExecutorService +{ + /** + * This rejected execution handler shoehorns in a job in an + * {@link ExecutorService}'s job queue if it is rejected by the service. + * This requires that the service's job queue has an upper bound and that it + * blocks when trying to insert more elements than the bound. + * @author Karl Gustafsson + * @since 1.1 + */ + private static class ShoehornInJobRejectedExecutionHandler implements RejectedExecutionHandler + { + private static final ShoehornInJobRejectedExecutionHandler INSTANCE = new ShoehornInJobRejectedExecutionHandler(); + + public void rejectedExecution(Runnable r, ThreadPoolExecutor executor) + { + // System.out.print("Shoehorning... "); + try + { + executor.getQueue().put(r); + } + catch (InterruptedException e) + { + throw new RuntimeException(e); + } + // System.out.println("done"); + } + } + + private final ThreadPoolExecutor m_executor; + private final ErrorState m_errorState; + + BZip2EncoderExecutorServiceImpl(int noThreads, ErrorState es) + { + m_executor = new ThreadPoolExecutor(noThreads, noThreads, 100, TimeUnit.SECONDS, new ArrayBlockingQueue(1), new EncodingThreadFactory(es), ShoehornInJobRejectedExecutionHandler.INSTANCE); + m_errorState = es; + } + + ErrorState getErrorState() + { + return m_errorState; + } + + void execute(BlockEncoderRunnable r) + { + m_executor.execute(r); + } + + public void shutdown() + { + m_executor.shutdown(); + } +} diff --git a/src/main/java/org/at4j/comp/bzip2/BZip2OutputStream.java b/src/main/java/org/at4j/comp/bzip2/BZip2OutputStream.java new file mode 100644 index 0000000..de5f5dc --- /dev/null +++ b/src/main/java/org/at4j/comp/bzip2/BZip2OutputStream.java @@ -0,0 +1,306 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it 
and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package org.at4j.comp.bzip2; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.concurrent.atomic.AtomicInteger; + +import org.at4j.support.io.LittleEndianBitOutputStream; + +/** + * This is an {@link OutputStream} for bzip2 compressing data. + *

+ * For more information on the inner workings of bzip2, see the Wikipedia article on bzip2. + *

+ * This stream is not safe for concurrent access by several writing + * threads. A client must provide external synchronization to use this from + * several threads. + * @author Karl Gustafsson + * @since 1.1 + * @see BZip2OutputStreamSettings + */ +public class BZip2OutputStream extends OutputStream +{ + private static final byte[] EOS_MAGIC = new byte[] { 0x17, 0x72, 0x45, 0x38, 0x50, (byte) 0x90 }; + + // This is used to generate unique hash codes for each created stream + // object. + private static final AtomicInteger HASH_CODE_GENERATOR = new AtomicInteger(0); + + private final LittleEndianBitOutputStream m_wrapped; + // The block size in bytes + private final int m_blockSize; + // This may be null + + // Data stream that writes to the block currently being filled with data. + private final BlockOutputStream m_blockOutputStream; + // If several threads are used to encode the data, this is used to write the + // encoded blocks in the right order. + private final EncodedBlockWriter m_encodedBlockWriter; + private final BZip2EncoderExecutorServiceImpl m_executorService; + private final boolean m_iCreatedExecutor; + private final int m_hashCode = HASH_CODE_GENERATOR.getAndIncrement(); + + private boolean m_closed; + private long m_pos = 0; + + private static void writeFileHeader(OutputStream os, int blockSize) throws IOException + { + // File header + os.write('B'); + os.write('Z'); + // File version + os.write('h'); + // Block size as a character. The ASCII code for 0 is 48. + os.write(blockSize + 48); + } + + /** + * Create a new bzip2 compressing output stream with default settings. + * @param wrapped Compressed data is written to this stream. + * @throws IOException On errors writing the file header. + * @see #BZip2OutputStream(OutputStream, BZip2OutputStreamSettings) + */ + public BZip2OutputStream(OutputStream wrapped) throws IOException + { + this(wrapped, new BZip2OutputStreamSettings()); + } + + /** + * Create a new bzip2 compressing output stream. 
+ * @param wrapped Compressed data is written to this stream. + * @param settings Compression settings. + * @throws IOException On errors writing the file header. + * @see #BZip2OutputStream(OutputStream) + */ + public BZip2OutputStream(OutputStream wrapped, BZip2OutputStreamSettings settings) throws IOException + { + // Null checks + wrapped.getClass(); + settings.getClass(); + + m_wrapped = new LittleEndianBitOutputStream(wrapped); + // bzip2 uses 1kb == 1000b + m_blockSize = settings.getBlockSize() * 100 * 1000; + + writeFileHeader(wrapped, settings.getBlockSize()); + + EncodingScratchpad sp; + if (settings.getExecutorService() != null) + { + // Use the supplied executor service + // There is only one allowed implementation for now. + m_executorService = (BZip2EncoderExecutorServiceImpl) settings.getExecutorService(); + m_iCreatedExecutor = false; + m_encodedBlockWriter = new EncodedBlockWriter(m_wrapped); + // Each encoder thread has its own scratchpad + sp = null; + } + else if (settings.getNumberOfEncoderThreads() > 0) + { + // Use separate encoder threads. + m_executorService = new BZip2EncoderExecutorServiceImpl(settings.getNumberOfEncoderThreads(), new SingleObserverErrorState()); + m_iCreatedExecutor = true; + m_encodedBlockWriter = new EncodedBlockWriter(m_wrapped); + // Each encoder thread has its own scratchpad + sp = null; + } + else + { + // Encode in the thread writing to the stream. 
+ m_executorService = null; + m_iCreatedExecutor = false; + sp = new EncodingScratchpad(); + m_encodedBlockWriter = null; + } + + m_blockOutputStream = new BlockOutputStream(m_wrapped, m_blockSize, settings.getNumberOfHuffmanTreeRefinementIterations() , m_executorService, this, m_encodedBlockWriter, sp); + } + + private void assertNotClosed() throws IOException + { + if (m_closed) + { + throw new IOException("This stream is closed"); + } + } + + private void checkErrorState() throws IOException, RuntimeException + { + if (m_executorService != null) + { + m_executorService.getErrorState().checkAndClearErrors(this); + } + } + + private void debug(String msg) + { + + } + + private void writeEosBlock() throws IOException + { + // Write the end of stream magic + for (int i = 0; i < EOS_MAGIC.length; i++) + { + m_wrapped.writeBitsLittleEndian(EOS_MAGIC[i] & 0xFF, 8); + } + // Write file checksum + m_wrapped.writeBitsLittleEndian(m_blockOutputStream.getFileChecksum(), 32); + m_wrapped.padToByteBoundary(); + } + + @Override + public void write(int b) throws IOException + { + assertNotClosed(); + checkErrorState(); + + m_pos++; + m_blockOutputStream.write(b & 0xFF); + } + + @Override + public void write(byte[] data) throws IOException + { + assertNotClosed(); + checkErrorState(); + + m_pos += data.length; + m_blockOutputStream.write(data); + } + + @Override + public void write(byte[] data, int offset, int len) throws IOException, IndexOutOfBoundsException + { + assertNotClosed(); + checkErrorState(); + + if (offset < 0) + { + throw new IndexOutOfBoundsException("Offset: " + offset); + } + if (len < 0) + { + throw new IndexOutOfBoundsException("Length: " + len); + } + if (offset + len > data.length) + { + throw new IndexOutOfBoundsException("Offset: " + offset + " + Length: " + len + " > length of data: " + data.length); + } + + m_pos += len; + m_blockOutputStream.write(data, offset, len); + } + + @Override + public void close() throws IOException + { + checkErrorState(); + 
+ if (!m_closed) + { + // This writes out any remaining run length encoding data and closes + // the block output stream. + m_blockOutputStream.close(); + + if ((m_pos > 0) && (m_encodedBlockWriter != null)) + { + // Wait for all blocks to be written. + try + { + m_encodedBlockWriter.waitFor(); + } + catch (InterruptedException e) + { + // Repackage + throw new IOException("Interrupted. The output file is most likely corrupted."); + } + checkErrorState(); + } + + writeEosBlock(); + + m_wrapped.close(); + + debug("Original size: " + m_pos + ", compressed size: " + m_wrapped.getNumberOfBytesWritten()); + + if (m_iCreatedExecutor && (m_executorService != null)) + { + m_executorService.shutdown(); + } + m_closed = true; + super.close(); + } + } + + @Override + public int hashCode() + { + return m_hashCode; + } + + @Override + public boolean equals(Object o) + { + return this == o; + } + + /** + * Close the stream if the client has been sloppy about it. + */ + @Override + protected void finalize() throws Throwable + { + close(); + super.finalize(); + } + + /** + * Create a {@link BZip2EncoderExecutorService} that can be shared between + * several {@link BZip2OutputStream}:s to spread the bzip2 encoding work + * over several threads. The created executor service can be passed to the + * {@link BZip2OutputStream} constructor in a + * {@link BZip2OutputStreamSettings} object. + * @param noThreads The number of threads available to the executor. + * @return The executor service. + */ + public static BZip2EncoderExecutorService createExecutorService(int noThreads) + { + return new BZip2EncoderExecutorServiceImpl(noThreads, new MultipleObserverErrorState()); + } + + /** + * Create a {@link BZip2EncoderExecutorService} that can be shared between + * several {@link BZip2OutputStream}:s to spread the bzip2 encoding work + * over several threads. 
The created executor service can be passed to the + * {@link BZip2OutputStream} constructor in a + * {@link BZip2OutputStreamSettings} object. + *

+ * The created executor will have as many threads available to it as there + * are CPU:s available to the JVM. + * @return The executor service. + */ + public static BZip2EncoderExecutorService createExecutorService() + { + return createExecutorService(Runtime.getRuntime().availableProcessors()); + } +} diff --git a/src/main/java/org/at4j/comp/bzip2/BZip2OutputStreamSettings.java b/src/main/java/org/at4j/comp/bzip2/BZip2OutputStreamSettings.java new file mode 100644 index 0000000..fd35e27 --- /dev/null +++ b/src/main/java/org/at4j/comp/bzip2/BZip2OutputStreamSettings.java @@ -0,0 +1,223 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package org.at4j.comp.bzip2; + +import org.at4j.support.lang.At4JException; + +/** + * This object contains settings for the {@link BZip2OutputStream}. + *

+ * When created, this object contains the default settings. Modify the settings + * by calling setter methods on this object. + * @author Karl Gustafsson + * @since 1.1 + * @see BZip2OutputStream + */ +public class BZip2OutputStreamSettings implements Cloneable +{ + /** + * The minimum size of an encoded data block in hundreds of kilobytes. Using + * a small block size gives faster but worse compression. + */ + public static final int MIN_BLOCK_SIZE = 1; + + /** + * The maximum size of an encoded data block in hundreds of kilobytes. Using + * a large block size gives slower but better compression. + */ + public static final int MAX_BLOCK_SIZE = 9; + + /** + * The default block size. + */ + public static final int DEFAULT_BLOCK_SIZE = MAX_BLOCK_SIZE; + + /** + * The default number of Huffman tree refinement iterations. By having more + * tree refinement iterations the compression gets better, but as the number + * is increased the returns are diminishing. + */ + public static final int DEFAULT_NO_OF_HUFFMAN_TREE_REFINEMENT_ITERATIONS = 5; + + /** + * The default number of encoder threads. + */ + public static final int DEFAULT_NO_OF_ENCODER_THREADS = 0; + + private int m_blockSize = DEFAULT_BLOCK_SIZE; + private int m_numberOfHuffmanTreeRefinementIterations = DEFAULT_NO_OF_HUFFMAN_TREE_REFINEMENT_ITERATIONS; + private int m_numberOfEncoderThreads = DEFAULT_NO_OF_ENCODER_THREADS; + private BZip2EncoderExecutorService m_executorService; + + /** + * Set the size of compressed data blocks. A high block size gives good but + * slow compression. A low block size gives worse but faster compression. + *

+ * The default block size is 9 (the highest permitted value). + * @param bs The block size in hundreds of kilobytes. This should be between + * 1 and 9 (inclusive). + * @return {@code this} + * @throws IllegalArgumentException If the block size is not in the + * permitted range. + */ + public BZip2OutputStreamSettings setBlockSize(int bs) throws IllegalArgumentException + { + if (bs < MIN_BLOCK_SIZE || bs > MAX_BLOCK_SIZE) + { + throw new IllegalArgumentException("Invalid block size " + bs + ". It must be between " + MIN_BLOCK_SIZE + " and " + MAX_BLOCK_SIZE + " (inclusive)"); + } + m_blockSize = bs; + return this; + } + + /** + * Get the block size for a compressed data block. + * @return The block size for a compressed data block. + */ + public int getBlockSize() + { + return m_blockSize; + } + + /** + * Set the number of tree refinement iterations that are run when creating + * Huffman trees for each compressed data block. + *

+ * A higher value for this parameter should give better but slower + * compression. As the value increases the returns are diminishing. + *

+ * The default value is five refinement iterations. + * @param no The number of Huffman tree refinement iterations. This should + * be a positive integer larger than zero. + * @return {@code this} + * @throws IllegalArgumentException If the number is not a positive integer + * larger than zero. + */ + public BZip2OutputStreamSettings setNumberOfHuffmanTreeRefinementIterations(int no) throws IllegalArgumentException + { + if (no < 1) + { + throw new IllegalArgumentException("Invalid value " + no + ". It must be greater than zero"); + } + m_numberOfHuffmanTreeRefinementIterations = no; + return this; + } + + /** + * Get the number of Huffman tree refinement iterations. + * @return The number of Huffman tree refinement iterations. + */ + public int getNumberOfHuffmanTreeRefinementIterations() + { + return m_numberOfHuffmanTreeRefinementIterations; + } + + /** + * Set a log adapter for logging diagnostic output to. Output is + * logged to the debug and trace levels. + *

+ * By default no log adapter is used and hence no diagnostic output is + * logged. + * @param la A log adapter. + * @return {@code this} + */ + public BZip2OutputStreamSettings setLogAdapter(Object la) + { + return this; + } + + + /** + * Set the number of encoder threads used for bzip2 compressing data. bzip2 + * encoding is CPU intensive and giving the encoder more threads to work + * with can drastically shorten the encoding time. The drawback is that the + * memory consumption grows since each encoder thread must keep its data in + * memory. + *

+ * The default number of encoder threads is zero, which means that the + * thread that is writing the data to the {@link BZip2OutputStream} will be + * used for the encoding. + *

+ * For the shortest encoding time, use as many threads as there are + * available CPU:s in the system. + * @param no The number of encoder threads to use. If this is set to {@code + * 0}, the encoding will be done in the thread writing to the stream. + * @return {@code this} + * @throws IllegalArgumentException If {@code no} is negative. + * @see #setExecutorService(BZip2EncoderExecutorService) + */ + public BZip2OutputStreamSettings setNumberOfEncoderThreads(int no) throws IllegalArgumentException + { + if (no < 0) + { + throw new IllegalArgumentException("Invalid number of encoder threads " + no + ". The number must be zero or greater"); + } + + m_numberOfEncoderThreads = no; + return this; + } + + public int getNumberOfEncoderThreads() + { + return m_numberOfEncoderThreads; + } + + /** + * Set an executor service that the {@link BZip2OutputStream} will use to + * spread the encoding over several threads. This executor can be shared + * among several {@link BZip2OutputStream} objects. + *

+ * If an executor service is set using this method, all threads that are + * available to the executor are used for the encoding and any value set + * using {@link #setNumberOfEncoderThreads(int)} is ignored. + *

+ * An executor service is created using the + * {@link BZip2OutputStream#createExecutorService()} or the + * {@link BZip2OutputStream#createExecutorService(int)} method. + * @param executorService The executor service. + * @return {@code this} + * @see #setNumberOfEncoderThreads(int) + */ + public BZip2OutputStreamSettings setExecutorService(BZip2EncoderExecutorService executorService) + { + m_executorService = executorService; + return this; + } + + public BZip2EncoderExecutorService getExecutorService() + { + return m_executorService; + } + + /** + * Make a copy of this object. + */ + @Override + public BZip2OutputStreamSettings clone() + { + try + { + return (BZip2OutputStreamSettings) super.clone(); + } + catch (CloneNotSupportedException e) + { + throw new At4JException("Bug", e); + } + } +} diff --git a/src/main/java/org/at4j/comp/bzip2/Block.java b/src/main/java/org/at4j/comp/bzip2/Block.java new file mode 100644 index 0000000..928cd7a --- /dev/null +++ b/src/main/java/org/at4j/comp/bzip2/Block.java @@ -0,0 +1,29 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package org.at4j.comp.bzip2; + +/** + * Interface identifying a bzip2 data block. Used by the {@link BlockDecoder}. 
interface Block
{
    // Nothing: this is a marker interface and declares no members. It is the
    // common return type of BlockDecoder.getNextBlock(), which hands back
    // either a CompressedDataBlock or an EosBlock.
}
+ * @author Karl Gustafsson + * @since 1.1 + */ +final class BlockDecoder +{ + // The magic number identifying a block of compressed data + private static final byte[] COMPRESSED_BLOCK_MAGIC = new byte[] { (byte) 0x31, (byte) 0x41, (byte) 0x59, (byte) 0x26, (byte) 0x53, (byte) 0x59 }; + // The magic number identifying the end of stream block + private static final byte[] EOS_BLOCK_MAGIC = new byte[] { (byte) 0x17, (byte) 0x72, (byte) 0x45, (byte) 0x38, (byte) 0x50, (byte) 0x90 }; + + // The number of symbols to read from each Huffman tree before switching + private static final int SYMBOLS_TO_READ_FROM_EACH_TREE = 50; + + // The symbol value of the special RUNA symbol. + private static final int RUNA_SYMBOL = 0; + // The symbol value of the special RUNB symbol. + private static final int RUNB_SYMBOL = 1; + + private static final int MAX_NO_OF_MTF_SYMBOLS = 258; + + private static final byte[] INITIAL_MOVE_TO_FRONT_ALPHABET = new byte[MAX_NO_OF_MTF_SYMBOLS]; + static + { + for (int i = 0; i < MAX_NO_OF_MTF_SYMBOLS; i++) + { + INITIAL_MOVE_TO_FRONT_ALPHABET[i] = (byte) i; + } + } + + private final LittleEndianBitInputStream m_in; + private final int m_blockSize; + + // Data read from the block header + + // Block checksum (CRC) + private int m_readBlockChecksum; + // The pointer to the original data used in the BW transform + private int m_originalDataPointer; + // The Huffman trees used for decompression + private HighValueBranchHuffmanTree[] m_huffmanTrees; + // The EOB (End Of Block) symbol index. + private int m_endOfBlockSymbol; + // The number of times that the Huffman trees are switched in the input. + // The trees are switched every 50 bytes. + private int m_numberOfTimesHuffmanTreesAreSwitched; + private int[] m_treeUse; + // Mapping between symbol values and byte values. 
+ private byte[] m_symbolSequenceNos; + // Frequency of each byte in the pre-BW data + private int[] m_byteFrequencies; + + // State variables + + // The number of the currently selected Huffman tree + private HighValueBranchHuffmanTree m_curTree; + // The number of symbols left to read from the current Huffman tree + private int m_symbolsLeftToReadFromCurTree; + // The current number of Huffman tree switches + private int m_switchNo; + // A counter for the number of bytes decoded in this block. + private int m_noBytesDecoded; + private ByteMoveToFront m_mtfTransformer; + // This will hold the decoded data (before the Burrows Wheeler decoding) + private final byte[] m_decoded; + + BlockDecoder(LittleEndianBitInputStream in, int blockSize) + { + m_in = in; + m_blockSize = blockSize; + m_decoded = new byte[blockSize]; + } + + private void throwIOException(String msg) throws IOException + { + throw new IOException(msg + ". Position in input stream: " + m_in.getNumberOfBytesRead()); + } + + private void checkInterrupted() throws InterruptedException + { + if (Thread.interrupted()) + { + throw new InterruptedException(); + } + } + + private void trace(String s) + { + System.out.println(s); + } + + static HighValueBranchHuffmanTree decodeHuffmanTree(final int totalNumberOfSymbols, final LittleEndianBitInputStream in) throws IOException + { + int[] symbolLengths = new int[totalNumberOfSymbols]; + + // Starting bit length for Huffman deltas in this tree + int currentBitLength = in.readBits(5); + if (currentBitLength > 20) + { + throw new IOException("Invalid starting bit length for Huffman deltas: " + currentBitLength + ". Must be <= 20"); + } + + // Initialize min and max lengths per tree with values that + // will certainly be overwritten. + int minBitLengthPerTree = 20; + int maxBitLengthPerTree = 0; + + for (int j = 0; j < totalNumberOfSymbols; j++) + { + while (in.readBit()) + { + currentBitLength += in.readBit() ? 
-1 : 1; + if ((currentBitLength < 1) || (currentBitLength > 20)) + { + throw new IOException("Invalid bit length " + currentBitLength); + } + } + symbolLengths[j] = currentBitLength; + + if (currentBitLength < minBitLengthPerTree) + { + minBitLengthPerTree = currentBitLength; + } + if (currentBitLength > maxBitLengthPerTree) + { + maxBitLengthPerTree = currentBitLength; + } + } + return new HighValueBranchHuffmanTree(symbolLengths, minBitLengthPerTree, maxBitLengthPerTree, false); + } + + private void readCompressedBlockHeader() throws IOException + { + byte[] barr = new byte[4]; + + // Block checksum + m_readBlockChecksum = (int) UnsignedInteger.fromLittleEndianByteArrayToLong(m_in.readBytes(barr, 0, 4), 0); + + // Randomized block? + if (m_in.readBit()) + { + throwIOException("Randomized block mode is not supported"); + } + + // Starting pointer into BWT + m_in.readBytes(barr, 1, 3); + barr[0] = 0; + m_originalDataPointer = (int) UnsignedInteger.fromLittleEndianByteArrayToLong(barr, 0); + if (m_originalDataPointer > m_blockSize) + { + throw new IOException("Invalid starting pointer " + m_originalDataPointer + ". 
It must be less than the block size " + m_blockSize); + } + + // Huffman used codes + boolean[] usedSymbols = new boolean[256]; + int numberOfUsedSymbols = 0; + + boolean[] inUseBlocks = new boolean[16]; + for (int i = 0; i < 16; i++) + { + inUseBlocks[i] = m_in.readBit(); + } + for (int i = 0; i < 16; i++) + { + if (inUseBlocks[i]) + { + for (int j = 0; j < 16; j++) + { + if (m_in.readBit()) + { + usedSymbols[i * 16 + j] = true; + numberOfUsedSymbols++; + } + } + } + } + if (numberOfUsedSymbols == 0) + { + throwIOException("No symbols used in table"); + } + + // Create a mapping for the sequence numbers of all used bytes + m_symbolSequenceNos = new byte[numberOfUsedSymbols]; + int useIndex = 0; + for (int i = 0; i < 256; i++) + { + if (usedSymbols[i]) + { + m_symbolSequenceNos[useIndex++] = (byte) (i & 0xFF); + } + } + assert useIndex == numberOfUsedSymbols; + + m_byteFrequencies = new int[256]; + + // The number of Huffman trees to use + int numberOfHuffmanTrees = m_in.readBits(3); + if (numberOfHuffmanTrees < 2 || numberOfHuffmanTrees > 6) + { + throwIOException("Invalid number of Huffman trees " + numberOfHuffmanTrees + ". Must be between 2 and 6 (inclusive)"); + } + + // The number of times the trees to use are swapped in the input. + // The trees are swapped each 50 bytes. + m_numberOfTimesHuffmanTreesAreSwitched = m_in.readBitsLittleEndian(15); + if (m_numberOfTimesHuffmanTreesAreSwitched < 1) + { + throwIOException("Invalid number of times the Huffman trees are switched in the input: " + m_numberOfTimesHuffmanTreesAreSwitched); + } + + // Zero-terminated bit runs for each tree switch + int[] treeUseMtf = new int[m_numberOfTimesHuffmanTreesAreSwitched]; + for (int i = 0; i < m_numberOfTimesHuffmanTreesAreSwitched; i++) + { + treeUseMtf[i] = 0; + while (m_in.readBit()) + { + treeUseMtf[i]++; + } + if (treeUseMtf[i] > numberOfHuffmanTrees) + { + throwIOException("Invalid Huffman tree use MTF " + treeUseMtf[i] + ". 
Must be less than the number of Huffman trees, " + numberOfHuffmanTrees); + } + } + + // Decode the tree use MTF values + m_treeUse = new int[m_numberOfTimesHuffmanTreesAreSwitched]; + // The "alphabet" for the MTF encoding -- the indices of the different + // tree uses. + int[] treeUseIndices = new int[numberOfHuffmanTrees]; + for (int i = 0; i < numberOfHuffmanTrees; i++) + { + treeUseIndices[i] = i; + } + new IntMoveToFront(treeUseIndices).decode(treeUseMtf, m_treeUse); + + // Settings for the Huffman trees + + // The total number of used symbols is the value we calculated above - 1 + // + RUNA, RUNB and an end of stream marker. + int totalNumberOfSymbols = numberOfUsedSymbols + 2; + m_huffmanTrees = new HighValueBranchHuffmanTree[numberOfHuffmanTrees]; + for (int i = 0; i < numberOfHuffmanTrees; i++) + { + m_huffmanTrees[i] = decodeHuffmanTree(totalNumberOfSymbols, m_in); + } + + // The symbol value for the end of the data block. + m_endOfBlockSymbol = totalNumberOfSymbols - 1; + } + + private void selectNewHuffmanTree() throws IOException + { + if (m_switchNo >= m_numberOfTimesHuffmanTreesAreSwitched) + { + throwIOException("One Huffman tree switch too many: " + m_switchNo); + } + m_symbolsLeftToReadFromCurTree = SYMBOLS_TO_READ_FROM_EACH_TREE; + m_curTree = m_huffmanTrees[m_treeUse[m_switchNo]]; + m_switchNo++; + } + + private int readSymbol() throws IOException + { + if (m_symbolsLeftToReadFromCurTree == 0) + { + selectNewHuffmanTree(); + } + final int symbol = m_curTree.readNext(m_in); + m_symbolsLeftToReadFromCurTree--; + return symbol; + } + + private void decodeSingleByte(final int symbolMtf) throws IOException + { + // Move To Front decode the symbol + final int byteIndex = m_mtfTransformer.decode(symbolMtf - 1) & 0xFF; + + final byte value = m_symbolSequenceNos[byteIndex]; + m_decoded[m_noBytesDecoded++] = value; + m_byteFrequencies[value & 0xFF]++; + } + + // returns the next symbol + private int handleRunaAndRunb(int symbol) throws IOException + { + 
int n = 1; + int multiplier = 0; + while (symbol == RUNA_SYMBOL || symbol == RUNB_SYMBOL) + { + if (symbol == RUNA_SYMBOL) + { + multiplier += n; + } + else + { + multiplier += 2 * n; + } + // Multiply n with 2 + n <<= 1; + symbol = readSymbol(); + } + + // The repeated value is at the front of the MTF list + final int byteIndex = m_mtfTransformer.decode(0) & 0xFF; + final byte value = m_symbolSequenceNos[byteIndex]; + if (multiplier == 1) + { + m_decoded[m_noBytesDecoded++] = value; + m_byteFrequencies[value & 0xFF]++; + } + else + { + Arrays.fill(m_decoded, m_noBytesDecoded, m_noBytesDecoded + multiplier, value); + m_noBytesDecoded += multiplier; + m_byteFrequencies[value & 0xFF] += multiplier; + } + return symbol; + } + + CompressedDataBlock readCompressedDataBlock() throws IOException, InterruptedException + { + readCompressedBlockHeader(); + + int symbol = readSymbol(); + + while (true) + { + checkInterrupted(); + + if (symbol == RUNA_SYMBOL || symbol == RUNB_SYMBOL) + { + symbol = handleRunaAndRunb(symbol); + } + else if (symbol == m_endOfBlockSymbol) + { + BurrowsWheelerDecoder bwd = new BurrowsWheelerDecoder(m_decoded, m_noBytesDecoded, m_byteFrequencies, m_originalDataPointer); + return new CompressedDataBlock(new RLEDecodingInputStream(bwd.decode(), m_readBlockChecksum), m_readBlockChecksum); + } + else + { + decodeSingleByte(symbol); + symbol = readSymbol(); + } + } + } + + private void initDecoderState() + { + // Initialize the MTF alphabet + final byte[] moveToFrontAlphabet = new byte[MAX_NO_OF_MTF_SYMBOLS]; + System.arraycopy(INITIAL_MOVE_TO_FRONT_ALPHABET, 0, moveToFrontAlphabet, 0, MAX_NO_OF_MTF_SYMBOLS); + m_mtfTransformer = new ByteMoveToFront(moveToFrontAlphabet); + m_curTree = null; + m_symbolsLeftToReadFromCurTree = 0; + m_switchNo = 0; + m_noBytesDecoded = 0; + } + + Block getNextBlock() throws IOException + { + initDecoderState(); + + byte[] barr = new byte[6]; + m_in.readBytes(barr, 0, 6); + if (Arrays.equals(COMPRESSED_BLOCK_MAGIC, barr)) + 
{ + trace("Found block of compressed data"); + try + { + return readCompressedDataBlock(); + } + catch (InterruptedException e) + { + throw new At4JException(e); + } + } + else if (Arrays.equals(EOS_BLOCK_MAGIC, barr)) + { + trace("Found end of stream block"); + m_in.readBytes(barr, 0, 4); + int readCrc32 = (int) UnsignedInteger.fromLittleEndianByteArrayToLong(barr, 0); + return new EosBlock(readCrc32); + } + else + { + throwIOException("Invalid block header " + Arrays.toString(barr) + ". Expected compressed data block or end of stream block"); + // Never reached + return null; + } + } +} diff --git a/src/main/java/org/at4j/comp/bzip2/BlockEncodedCallback.java b/src/main/java/org/at4j/comp/bzip2/BlockEncodedCallback.java new file mode 100644 index 0000000..c1db007 --- /dev/null +++ b/src/main/java/org/at4j/comp/bzip2/BlockEncodedCallback.java @@ -0,0 +1,54 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.at4j.comp.bzip2; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; + +import org.at4j.support.io.BitOutput; + +/** + * This callback is called by the {@link BlockEncoder} when it has encoded its + * block. 
+ * @author Karl Gustafsson + * @since 1.1 + */ +final class BlockEncodedCallback +{ + private final int m_blockNo; + private final EncodedBlockWriter m_writer; + private final ByteArrayOutputStream m_byteOut; + private final BitOutput m_bitOut; + + BlockEncodedCallback(final int blockNo, final ByteArrayOutputStream byteOut, final BitOutput bitOut, final EncodedBlockWriter writer) + { + m_blockNo = blockNo; + m_writer = writer; + m_byteOut = byteOut; + m_bitOut = bitOut; + } + + /** + * This is called by the {@link BlockEncoder} when it is done. + */ + void reportBlockDone() throws IOException + { + m_writer.writeBlock(m_blockNo, new EncodedBlockData(m_byteOut.toByteArray(), m_bitOut.getNumberOfBitsInUnfinishedByte(), m_bitOut.getUnfinishedByte())); + } +} diff --git a/src/main/java/org/at4j/comp/bzip2/BlockEncoder.java b/src/main/java/org/at4j/comp/bzip2/BlockEncoder.java new file mode 100644 index 0000000..e7b04ce --- /dev/null +++ b/src/main/java/org/at4j/comp/bzip2/BlockEncoder.java @@ -0,0 +1,893 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ +package org.at4j.comp.bzip2; + +import java.io.IOException; +import java.util.Arrays; + +import org.at4j.comp.bzip2.BurrowsWheelerEncoder.BurrowsWheelerEncodingResult; +import org.at4j.support.comp.IntMoveToFront; +import org.at4j.support.io.BitOutput; + +/** + * This is used by the thread encoding a bzip2 block. + * @author Karl Gustafsson + * @since 1.1 + */ +final class BlockEncoder +{ + private static final byte[] BLOCK_MAGIC = new byte[] { 0x31, 0x41, 0x59, 0x26, 0x53, 0x59 }; + + // The maximum Huffman tree depth + private static final int MAX_HUFFMAN_BIT_LENGTH = 17; + + // The values of the RUNA and RUNB symbols + private static final int RUNA_SYMBOL = 0; + private static final int RUNB_SYMBOL = 1; + + private static final int MIN_NO_OF_HUFFMAN_TREES = 2; + static final int MAX_NO_OF_HUFFMAN_TREES = 6; + + // The maximum number of different MTF symbols: 256 bytes + RUNA + RUNB + + // EOB - one byte (the first symbol does not have to be encoded thanks to + // MTF and RLE) + static final int MAX_NO_OF_MTF_SYMBOLS = 258; + + // Write 50 symbols, then swap Huffman trees. + static final int NO_OF_SYMBOLS_PER_SEGMENT = 50; + + // Categories used when optimizing Huffman trees + // For each tree length, in which category does a segment belong depending + // on its encoded length percentage? 
+ static final int[][] CATEGORY_PER_NO_OF_TREES_AND_PERCENTAGE = new int[][] { + // Two trees: cutoff at 30% + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + // Three trees: cutoff at 18% and 45% + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }, + // Four trees: cutoff at 15%, 30% and 55% + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }, + // Five trees: cutoff at 12%, 25%, 40% and 60% + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }, + // Six trees: cutoff at 8%, 25%, 36%, 51% and 63% + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 } }; + + private static final byte[] INITIAL_MTF_ALPHABET = new byte[MAX_NO_OF_MTF_SYMBOLS]; + static + { + for (int i = 0; i < 
INITIAL_MTF_ALPHABET.length; i++) + { + INITIAL_MTF_ALPHABET[i] = (byte) (i & 0xFF); + } + } + + private final byte[] m_block; + private final int m_blockNo; + private final int m_blockSize; + private final int m_blockChecksum; + // Bit flags indicating which bytes that occur at least once in this block + private final boolean[] m_seenDifferentBytes; + // The number of different bytes seen in this block + private final int m_numberOfSeenDifferentBytes; + private final int m_numberOfHuffmanTreeRefinementIterations; + // Sink to write encoded data to. + private final BitOutput m_out; + // This callback is called when the block encoder is done. It may be null. + private final BlockEncodedCallback m_blockEncoderCallback; + + // This is set by the encoding thread before calling encode + private EncodingScratchpad m_scratchpad; + + BlockEncoder(final byte[] block, final int blockNo, final int blockSize, final int blockChecksum, final boolean[] seenDifferentBytes, final int numberOfSeenDifferentBytes, final int numberOfHuffmanTreeRefinementIterations, + final BitOutput out, final BlockEncodedCallback bec) + { + m_block = block; + m_blockNo = blockNo; + m_blockSize = blockSize; + m_blockChecksum = blockChecksum; + m_seenDifferentBytes = seenDifferentBytes; + m_numberOfSeenDifferentBytes = numberOfSeenDifferentBytes; + m_numberOfHuffmanTreeRefinementIterations = numberOfHuffmanTreeRefinementIterations; + m_out = out; + m_blockEncoderCallback = bec; + } + + void setScratchpad(EncodingScratchpad sp) + { + m_scratchpad = sp; + } + + /** + * Get the seen byte values in the current block. 
+ */ + private byte[] getSeenByteValues() + { + byte[] res = new byte[m_numberOfSeenDifferentBytes]; + int j = 0; + for (int i = 0; i < 256; i++) + { + if (m_seenDifferentBytes[i]) + { + res[j++] = (byte) (i & 0xFF); + } + } + assert j == m_numberOfSeenDifferentBytes; + return res; + } + + /** + * Add RUNA and RUNB symbols to {@code res} at {@code outIndex} to represent + * {@code no} repetitions of the previous symbol. + *

+ * This method is declared package-protected for the unit tests. + * @return The number of symbols added. outIndex should be incremented by + * this value by the caller. + */ + static int addRunaAndRunb(int[] res, int outIndex, int no) + { + int noWritten = 0; + while (no > 0) + { + switch (no % 2) + { + case 1: + res[outIndex + noWritten++] = RUNA_SYMBOL; + no -= 1; + break; + case 0: + res[outIndex + noWritten++] = RUNB_SYMBOL; + no -= 2; + break; + default: + // Should not occur unless we use relativistic arithmetic or + // something... + throw new RuntimeException(); + } + no >>>= 1; + } + return noWritten; + } + + /** + * Create a mapping between symbols and their index numbers in the array of + * symbols. + * @param symbols The symbols. + * @return An array containing the index number for each symbol that occurs + * in {@code symbols}. + */ + private byte[] createSequenceMap(byte[] symbols) + { + byte[] res = m_scratchpad.m_sequenceMap; + byte index = 0; + for (int i = 0; i < symbols.length; i++) + { + res[symbols[i] & 0xFF] = index++; + } + return res; + } + + private static class MTFAndRLEResult + { + // The encoded data as MTF symbols. + private final int[] m_encodedData; + private final int m_dataLen; + private final int m_noSeenDifferentSymbols; + + private MTFAndRLEResult(int[] symbols, int dataLen, int noSeenDifferentSymbols) + { + m_encodedData = symbols; + m_dataLen = dataLen; + m_noSeenDifferentSymbols = noSeenDifferentSymbols; + } + } + + /** + * Run MTF and RLE encoding of the data in {@code data}. + * @param data The data to encode. + * @param dataLen The data length. + * @param symbols An array containing all different symbols that occur in + * {@code data}. + * @return MTF and RLE encoded data. + */ + private MTFAndRLEResult moveToFrontAndRunLengthEncode(final byte[] data, final int dataLen, final byte[] symbols) + { + // This array will contain the run length encoded result. 
The result + // will probably be shorter than data.length thanks to the run length + // encoding, but data.length (+ 1 for the EOB symbol) is the worst case + // length. + boolean[] seenSymbols = new boolean[259]; + // RUNA and RUNB are always seen (even when they are not...) + seenSymbols[0] = true; + seenSymbols[1] = true; + int noSeenSymbols = 2; + + // Initialize the move to front alphabet + final byte[] mtfAlphabet = m_scratchpad.m_mtfAlphabet; + System.arraycopy(INITIAL_MTF_ALPHABET, 0, mtfAlphabet, 0, mtfAlphabet.length); + + // The array to store the encoded data in. + final int[] encodedData = m_scratchpad.m_encodedData; + + // Create a mapping between a symbol and its index number in the array + // of symbols + final byte[] sequenceMap = createSequenceMap(symbols); + + int lastSymbolIndex = 0; + int curOutArrayIndex = 0; + // A counter to keep track of the number of equal symbols in a row for + // the run length encoding + int noSame = 0; + for (int curInArrayIndex = 0; curInArrayIndex < dataLen; curInArrayIndex++) + { + final byte curSymbolIndex = sequenceMap[data[curInArrayIndex] & 0xFF]; + if (curSymbolIndex == lastSymbolIndex) + { + noSame++; + } + else + { + if (noSame > 0) + { + // Run length encode + curOutArrayIndex += addRunaAndRunb(m_scratchpad.m_encodedData, curOutArrayIndex, noSame); + noSame = 0; + } + + // Search for the current symbol in the MTF alphabet and count + // the distance + int j = 0; + byte lastMtf = mtfAlphabet[0]; + + while (mtfAlphabet[++j] != curSymbolIndex) + { + final byte nextLastMtf = mtfAlphabet[j]; + mtfAlphabet[j] = lastMtf; + lastMtf = nextLastMtf; + } + // Swap the symbols in the MTF alphabet. + mtfAlphabet[j] = lastMtf; + mtfAlphabet[0] = curSymbolIndex; + + // Output the distance. Distance 1 gets the value 2 since + // RUNA and RUNB have the values 0 and 1. 
+ int symbolVal = j + 1; + encodedData[curOutArrayIndex++] = symbolVal; + if (!seenSymbols[symbolVal]) + { + seenSymbols[symbolVal] = true; + noSeenSymbols++; + } + lastSymbolIndex = curSymbolIndex; + } + } + if (noSame > 0) + { + // One last run length encoding + curOutArrayIndex += addRunaAndRunb(encodedData, curOutArrayIndex, noSame); + } + return new MTFAndRLEResult(encodedData, curOutArrayIndex, noSeenSymbols); + } + + private static class EncodeAllSegmentsResult + { + // The shortest encoded segment length for all segments. + private int m_shortestLength; + // The longest encoded segment length for all segments. + private int m_longestLength; + // A list with encoding results (the bit length) for each segment and + // tree. + private int[][] m_encodingResults; + // For each segment, the index of the tree that gave the shortest + // encoded block. + private int[] m_treesUsed; + } + + /** + * Encode all 50-byte segments with all trees and count the encoded lengths. + * By doing this we can select the best Huffman tree for each segment by + * seeing which tree that gave the shortest encoded data. + * @param data The data to encode. + * @param dataLen The length of the data. (This may be shorter than the + * {@code data} array.) + * @param codeLengths An array of code lengths for each symbol for each + * investigated Huffman tree. + * @param numberOfHuffmanSegments The number of 50-byte segments in the + * current block. + * @param numberOfDifferentSymbols The number of different symbols in the + * data. This is the value of the EOB symbol + 1. + * @param res The result of the operation is stored in this object. 
+ */ + private void encodeAllSegmentsWithAllTrees(final int[] data, final int dataLen, final int[][] codeLengths, final int numberOfHuffmanSegments, final int numberOfDifferentSymbols, final EncodeAllSegmentsResult res) throws IOException + { + final int noTrees = codeLengths.length; + final int[][] encodingResults = m_scratchpad.m_encodingResults; + // The best tree for each segment + final int[] treesUsed = new int[numberOfHuffmanSegments]; + // The shortest seen shortest length for all segments + int shortestLength = Integer.MAX_VALUE; + // The longest seen -shortest- length for all segments + int longestLength = 0; + for (int segmentNo = 0; segmentNo < numberOfHuffmanSegments; segmentNo++) + { + // Encode this segment with all Huffman trees + int shortestLengthForSegment = Integer.MAX_VALUE; + int bestTreeIndex = 0; + final int[] segmentEncodingResultPerTree = new int[noTrees]; + final int segmentStart = segmentNo * NO_OF_SYMBOLS_PER_SEGMENT; + final int segmentEnd = Math.min(segmentStart + NO_OF_SYMBOLS_PER_SEGMENT, dataLen); + for (int treeNo = 0; treeNo < noTrees; treeNo++) + { + final int[] curTreeCodeLengths = codeLengths[treeNo]; + int bitLen = 0; + for (int j = segmentStart; j < segmentEnd; j++) + { + bitLen += curTreeCodeLengths[data[j]]; + } + + if (treeNo == 0) + { + shortestLengthForSegment = bitLen; + } + else if (bitLen < shortestLengthForSegment) + { + shortestLengthForSegment = bitLen; + bestTreeIndex = treeNo; + } + segmentEncodingResultPerTree[treeNo] = bitLen; + } + + if (segmentNo == 0) + { + shortestLength = longestLength = shortestLengthForSegment; + } + // Don't count the length of the last segment since that is likely + // to contain less than 50 symbols. 
+ else if ((segmentNo < (numberOfHuffmanSegments - 1)) && (shortestLengthForSegment < shortestLength)) + { + shortestLength = shortestLengthForSegment; + } + else if (shortestLengthForSegment > longestLength) + { + longestLength = shortestLengthForSegment; + } + encodingResults[segmentNo] = segmentEncodingResultPerTree; + treesUsed[segmentNo] = bestTreeIndex; + } + + res.m_encodingResults = encodingResults; + res.m_longestLength = longestLength; + res.m_shortestLength = shortestLength; + res.m_treesUsed = treesUsed; + } + + /** + * Divide all segments into x categories based on how well they were encoded + * by the globally optimal Huffman tree. An optimal Huffman tree is created + * for each category. + * @param data The data to encode. + * @param dataLen The length of the data. + * @param eobSymbol The value of the special EOB symbol. This is the highest + * used symbol value. + * @param numberOfHuffmanTrees The number of Huffman trees to create. + * @param numberOfSegments The number of 50-byte segments in the block. + * @param easr The encoding results from encoding the data with the globally + * optimal Huffman tree. + * @param globallyOptimalTree The symbol code lengths for the globally + * optimal Huffman tree. + * @return The symbols code lengths for each created tree. + */ + private int[][] createNewTrees(final int[] data, final int dataLen, final int eobSymbol, final int numberOfHuffmanTrees, final int numberOfSegments, final EncodeAllSegmentsResult easr, final int[] globallyOptimalTree) + { + // Clear the frequencies array + final int[][] frequencies = m_scratchpad.m_frequencies2d; + for (int i = 0; i < numberOfHuffmanTrees; i++) + { + Arrays.fill(frequencies[i], 0); + } + + // How big difference in number of bits is there between the shortest + // and the longest encoded segment? + final int maxDistance = easr.m_longestLength - easr.m_shortestLength; + if (maxDistance == 0) + { + // Nothing to do. We're as optimal as can be. 
+ return new int[][] { globallyOptimalTree }; + } + + final int numberOfCategories = numberOfHuffmanTrees; + // Which category does each 50-byte segment fall into? + final int[] categoryPerSegment = m_scratchpad.m_categoriesPerSegment; + // How many 50-byte segments fall into each category? + final int[] noSegmentsPerCategory = new int[numberOfCategories]; + + // This array is used to determine which category a segment falls into + // based on its encoded length. + final int[] catArray = CATEGORY_PER_NO_OF_TREES_AND_PERCENTAGE[numberOfHuffmanTrees - 2]; + + // Don't include the last segment in the statistics since that is likely + // to be shorter + for (int i = 0; i < numberOfSegments - 1; i++) + { + // The shortest length for this segment. + final int segmentLen = easr.m_encodingResults[i][easr.m_treesUsed[i]]; + final int percentage = (100 * (segmentLen - easr.m_shortestLength)) / maxDistance; + assert percentage >= 0; + assert percentage <= 100; + final int catNo = catArray[percentage]; + noSegmentsPerCategory[catNo]++; + categoryPerSegment[i] = catNo; + } + + for (int i = 0; i < numberOfSegments; i++) + { + final int segmentStart = i * NO_OF_SYMBOLS_PER_SEGMENT; + final int segmentEnd = Math.min(segmentStart + NO_OF_SYMBOLS_PER_SEGMENT, dataLen); + final int[] curCatFreqs = frequencies[categoryPerSegment[i]]; + for (int j = segmentStart; j < segmentEnd; j++) + { + curCatFreqs[data[j]]++; + } + } + + int noNewTrees = 0; + for (int i = 0; i < numberOfCategories; i++) + { + if (noSegmentsPerCategory[i] > 0) + { + // Create a new Huffman tree for this category. 
+ noNewTrees++; + } + } + assert noNewTrees > 0; + + int[][] res = new int[noNewTrees][]; + int treeNo = 0; + for (int i = 0; i < numberOfCategories; i++) + { + if (noSegmentsPerCategory[i] > 0) + { + res[treeNo++] = HighValueBranchHuffmanTree.createCodeLengths(frequencies[i], eobSymbol + 1, MAX_HUFFMAN_BIT_LENGTH, m_scratchpad); + } + } + return res; + } + + /** + * Refine the Huffman trees based on the encoding results. For each tree, + * make it optimal based on the data in the segments that it was the best + * tree for. + * @param data The data to encode. + * @param dataLen The length of the data to encode. + * @param codeLengths The code length for each symbol for each tree. + * @param easr The results when encoding the data with this set of trees. + * @param eobSymbol The value of the EOB symbol. This is the highest symbol + * value. + * @return Symbol code lengths for the refined trees. + */ + private int[][] refineTreesBasedOnEncodingResults(final int[] data, final int dataLen, final int[][] codeLengths, final EncodeAllSegmentsResult easr, final int eobSymbol) + { + // Clear the frequencies array + final int[][] frequencies = m_scratchpad.m_frequencies2d; + for (int i = 0; i < codeLengths.length; i++) + { + Arrays.fill(frequencies[i], 0); + } + + int segmentNo = 0; + int noInSegment = 0; + int curTree = easr.m_treesUsed[segmentNo]; + for (int i = 0; i < dataLen; i++) + { + int symbolVal = data[i]; + frequencies[curTree][symbolVal]++; + if (++noInSegment == NO_OF_SYMBOLS_PER_SEGMENT) + { + segmentNo++; + // If the data length is a multiple of 50, we do a switch after + // encoding the last symbol which will make segmentNo greater + // than the index of the last element in easr.m_treesUsed. + // Thus the check below. 
+ if (segmentNo < easr.m_treesUsed.length) + { + curTree = easr.m_treesUsed[segmentNo]; + } + noInSegment = 0; + } + } + + // Recreate the trees based on the gathered frequencies + int[][] res = new int[codeLengths.length][]; + for (int i = 0; i < codeLengths.length; i++) + { + res[i] = HighValueBranchHuffmanTree.createCodeLengths(frequencies[i], eobSymbol + 1, MAX_HUFFMAN_BIT_LENGTH, m_scratchpad); + } + return res; + } + + /** + * Get the number of Huffman trees to use based on the number of 50-byte + * segments in the data. + */ + private byte getNumberOfHuffmanTrees(int noSegments) + { + // Values from bzip2 + if (noSegments < 200) + { + return 2; + } + else if (noSegments < 600) + { + return 3; + } + else if (noSegments < 1200) + { + return 4; + } + else if (noSegments < 2400) + { + return 5; + } + else + { + return 6; + } + } + + /** + * Get the minimum and maximum code length from the array. + * @return An int array containing the minimum and the maximum code lengths, + * in that order. + */ + private int[] getMinAndMaxCodeLengths(final int[] codeLengths) + { + int minLength = codeLengths[0]; + int maxLength = codeLengths[0]; + for (int i = 1; i < codeLengths.length; i++) + { + if (codeLengths[i] < minLength) + { + minLength = codeLengths[i]; + } + else if (codeLengths[i] > maxLength) + { + maxLength = codeLengths[i]; + } + } + return new int[] { minLength, maxLength }; + } + + /** + * Create the Huffman trees that should be used for encoding the current + * block. First, an globally optimal tree is created. Then new trees are + * created from information on how well the globally optimal tree encoded + * different segments. Lastly, the created trees are optimized based on the + * data in the segments that they are used to encode. This last step is + * repeated a configurable number of times ({@code + * m_numberOfHuffmanTreeRefinementIterations}). + * @param data The data that should be encoded using the created Huffman + * trees. 
/**
 * Create the Huffman trees that should be used for encoding the current
 * block. First, an globally optimal tree is created. Then new trees are
 * created from information on how well the globally optimal tree encoded
 * different segments. Lastly, the created trees are optimized based on the
 * data in the segments that they are used to encode. This last step is
 * repeated a configurable number of times ({@code
 * m_numberOfHuffmanTreeRefinementIterations}).
 * <p>
 * Side effects: the EOB symbol is written to {@code data[dataLen]}, and the
 * frequency tables in the scratchpad are overwritten.
 * @param data The data that should be encoded using the created Huffman
 * trees. Must have room for one more element at index {@code dataLen}.
 * @param dataLen The length of the data, excluding the trailing EOB symbol.
 * @param noSymbolsUsed The number of different symbols used in the data.
 * This parameter is not read by the method body.
 * @return The trees, the tree selected for each segment, the number of
 * segments and the value of the EOB symbol.
 */
private HuffmanTreesAndUsage createHuffmanTrees(final int[] data, final int dataLen, final int noSymbolsUsed) throws IOException
{
    HuffmanTreesAndUsage res = new HuffmanTreesAndUsage();

    // The maximum possible number of trees.
    // +1 == EOB symbol
    // (dataLen - 1 + 1) is dataLen; the expression is the number of
    // segments needed for dataLen + 1 symbols, rounded up.
    res.m_noHuffmanSegments = ((dataLen - 1 + 1) / NO_OF_SYMBOLS_PER_SEGMENT) + 1;

    // Create a Huffman tree for the entire input.
    // Count the frequencies of the different bytes in the input.
    int[] frequencies = m_scratchpad.m_frequencies;
    Arrays.fill(frequencies, 0);

    // The maximum symbol value used (before the EOB symbol) is at least 1
    // (RUNB).
    int maxSymbolValue = 1;
    for (int j = 0; j < dataLen; j++)
    {
        int symbolVal = data[j];
        frequencies[symbolVal]++;
        if (symbolVal > maxSymbolValue)
        {
            maxSymbolValue = symbolVal;
        }
    }

    // Now we can infer the value of the EOB (End Of Block) symbol. Add it
    // to the end of the data. The data array is created so there should be
    // room for it.
    res.m_eobSymbol = maxSymbolValue + 1;
    frequencies[res.m_eobSymbol] = 1;
    data[dataLen] = res.m_eobSymbol;
    final int dataLenIncEob = dataLen + 1;

    // Maybe we're already done?
    if (res.m_noHuffmanSegments < MIN_NO_OF_HUFFMAN_TREES)
    {
        // We have to encode at least two trees anyway.
        // The same tree instance is stored in every slot.
        res.m_trees = new HighValueBranchHuffmanTree[MIN_NO_OF_HUFFMAN_TREES];
        int[] codeLengths = HighValueBranchHuffmanTree.createCodeLengths(frequencies, res.m_eobSymbol + 1, MAX_HUFFMAN_BIT_LENGTH, m_scratchpad);
        int[] minAndMaxLength = getMinAndMaxCodeLengths(codeLengths);
        HighValueBranchHuffmanTree tree = new HighValueBranchHuffmanTree(codeLengths, minAndMaxLength[0], minAndMaxLength[1], true);
        for (int i = 0; i < MIN_NO_OF_HUFFMAN_TREES; i++)
        {
            res.m_trees[i] = tree;
        }
        // Use tree #0 for all segments
        res.m_treeUsage = new int[res.m_noHuffmanSegments];
    }
    else
    {
        // Slot 0 holds the initial set of trees; slot i > 0 holds the
        // result of the i:th refinement.
        final int[][][] huffmanCodeLengths = new int[m_numberOfHuffmanTreeRefinementIterations + 1][][];
        final int[] codeLengthsForGloballyOptimalTree = HighValueBranchHuffmanTree.createCodeLengths(frequencies, res.m_eobSymbol + 1, MAX_HUFFMAN_BIT_LENGTH, m_scratchpad);
        final EncodeAllSegmentsResult easr = new EncodeAllSegmentsResult();
        // Note: this first pass and createNewTrees are given dataLen (the
        // EOB symbol excluded), while the passes in the loop below are
        // given dataLenIncEob.
        encodeAllSegmentsWithAllTrees(data, dataLen, new int[][] { codeLengthsForGloballyOptimalTree }, res.m_noHuffmanSegments, res.m_eobSymbol + 1, easr);
        huffmanCodeLengths[0] = createNewTrees(data, dataLen, res.m_eobSymbol, getNumberOfHuffmanTrees(res.m_noHuffmanSegments), res.m_noHuffmanSegments, easr, codeLengthsForGloballyOptimalTree);

        // Select the set of trees that gives the shortest total data length
        int bestIndex = -1;
        int bestLength = Integer.MAX_VALUE;
        int[] bestTreeUsage = null;
        for (int i = 0; i < huffmanCodeLengths.length; i++)
        {
            if (i > 0)
            {
                // Refine the trees
                // easr still holds the results of encoding with the
                // previous set of trees at this point.
                huffmanCodeLengths[i] = refineTreesBasedOnEncodingResults(data, dataLenIncEob, huffmanCodeLengths[i - 1], easr, res.m_eobSymbol);
            }
            encodeAllSegmentsWithAllTrees(data, dataLenIncEob, huffmanCodeLengths[i], res.m_noHuffmanSegments, res.m_eobSymbol + 1, easr);

            // Sum the encoded length of every segment using the tree that
            // was selected for it.
            int totLen = 0;
            for (int j = 0; j < easr.m_treesUsed.length; j++)
            {
                totLen += easr.m_encodingResults[j][easr.m_treesUsed[j]];
            }

            // Previously the length of each encoded tree was added to the
            // total length. That had negligible effect on the total encoded
            // length and a small impact on the performance.
            if (totLen < bestLength)
            {
                bestIndex = i;
                bestLength = totLen;
                // NOTE(review): this keeps a reference to the array held
                // by easr. That is only safe if
                // encodeAllSegmentsWithAllTrees stores a fresh array in
                // easr.m_treesUsed on every call -- confirm.
                bestTreeUsage = easr.m_treesUsed;
            }
        }

        int noTrees = huffmanCodeLengths[bestIndex].length;
        if (noTrees < MIN_NO_OF_HUFFMAN_TREES)
        {
            // Fewer trees than the minimum: fill every slot with a tree
            // built from the first set of code lengths.
            res.m_trees = new HighValueBranchHuffmanTree[MIN_NO_OF_HUFFMAN_TREES];
            int[] minAndMaxLength = getMinAndMaxCodeLengths(huffmanCodeLengths[bestIndex][0]);
            for (int i = 0; i < MIN_NO_OF_HUFFMAN_TREES; i++)
            {
                res.m_trees[i] = new HighValueBranchHuffmanTree(huffmanCodeLengths[bestIndex][0], minAndMaxLength[0], minAndMaxLength[1], true);
            }
        }
        else
        {
            res.m_trees = new HighValueBranchHuffmanTree[huffmanCodeLengths[bestIndex].length];
            for (int i = 0; i < huffmanCodeLengths[bestIndex].length; i++)
            {
                int[] minAndMaxLengths = getMinAndMaxCodeLengths(huffmanCodeLengths[bestIndex][i]);
                res.m_trees[i] = new HighValueBranchHuffmanTree(huffmanCodeLengths[bestIndex][i], minAndMaxLengths[0], minAndMaxLengths[1], true);
            }
        }
        res.m_treeUsage = bestTreeUsage;
    }
    return res;
}

/**
 * Encode the Huffman tree and write it to the output. The bit length of
 * the first symbol is written using five bits. After that, one
 * delta-encoded entry is written for every symbol (the first symbol
 * included): a {@code 1} bit followed by a {@code 0} bit for each step the
 * length is increased, a {@code 1} bit followed by a {@code 1} bit for each
 * step the length is decreased, and a terminating {@code 0} bit.
 * @param tree The tree to encode.
 * @param numberOfDifferentSymbols The number of different symbols in the
 * tree.
 * @param out The output to write the tree to.
 * @throws IOException Declared for errors from the bit output.
 */
static void encodeHuffmanTree(final HighValueBranchHuffmanTree tree, final int numberOfDifferentSymbols, final BitOutput out) throws IOException
{
    // Huffman bit length for the first symbol (0..17)
    int len = tree.getBitLength(0);
    out.writeBitsLittleEndian(len, 5);
    // Encode a delta length compared to the previous length for each
    // symbol.
    // For j == 0 the delta is zero, so only the terminating bit is written.
    for (int j = 0; j < numberOfDifferentSymbols; j++)
    {
        int prevLen = len;
        len = tree.getBitLength(j);
        while (len != prevLen)
        {
            // Alter length
            out.writeBit(true);
            if (prevLen < len)
            {
                // Make longer
                out.writeBit(false);
                prevLen++;
            }
            else
            {
                // Make shorter
                out.writeBit(true);
                prevLen--;
            }
        }
        // We are at the right length
        out.writeBit(false);
    }
}

/**
 * Write the block header for an encoded data block: block magic, block
 * checksum, the "randomized" flag, the Burrows Wheeler start pointer, the
 * used-byte bitmaps, the number of Huffman trees, the number of segments,
 * the move-to-front encoded tree selection for each segment and finally
 * the encoded Huffman trees.
 * @param blockChecksum The block checksum.
 * @param bwFirstPointer The pointer to the first element in the Burrows
 * Wheeler encoded data.
 * @param seenDifferentBytes Bit flags that are switched on for all bytes
 * that are seen in the written data. The first 256 elements are read.
 * @param mtfrle Results from the MTF and RLE encodings. This parameter is
 * not read by the method body.
 * @param htau The different Huffman trees and information on when they are
 * used.
 * @throws IOException Declared for errors from the bit output.
 */
private void writeBlockHeader(final int blockChecksum, int bwFirstPointer, boolean[] seenDifferentBytes, MTFAndRLEResult mtfrle, HuffmanTreesAndUsage htau) throws IOException
{
    // Block magic
    for (int i = 0; i < BLOCK_MAGIC.length; i++)
    {
        m_out.writeBitsLittleEndian(BLOCK_MAGIC[i] & 0xFF, 8);
    }
    // Checksum
    m_out.writeBitsLittleEndian(blockChecksum, 32);
    // Randomized? (no)
    m_out.writeBit(false);
    // Starting pointer into Burrows Wheeler matrix (24 bits)
    m_out.writeBitsLittleEndian(bwFirstPointer, 24);

    // Split the 256 byte values into 16 groups of 16 values each.
    // segmentsWithData[g] is set if any value in group g is used, and
    // seenData[g][k] is set if value 16 * g + k is used.
    boolean[] segmentsWithData = new boolean[16];
    boolean[][] seenData = new boolean[16][16];
    for (int i = 0; i < 256; i++)
    {
        if (seenDifferentBytes[i])
        {
            segmentsWithData[i / 16] = true;
            seenData[i / 16][i % 16] = true;
        }
    }

    // Write a flag for each block of 16 bytes that have at least one byte
    // occurring in the encoded data.
    for (int i = 0; i < 16; i++)
    {
        m_out.writeBit(segmentsWithData[i]);
    }
    // For each block used, write a flag for each of the used bytes in that
    // block.
    for (int i = 0; i < 16; i++)
    {
        if (segmentsWithData[i])
        {
            for (int j = 0; j < 16; j++)
            {
                m_out.writeBit(seenData[i][j]);
            }
        }
    }

    // The number of Huffman trees used (2..6)
    // NOTE(review): this is the only call in this method that uses
    // writeBits rather than writeBitsLittleEndian -- presumably
    // intentional; confirm against the BitOutput documentation.
    m_out.writeBits(htau.m_trees.length, 3);

    // The number of times the Huffman trees are switched (each 50 bytes)
    m_out.writeBitsLittleEndian(htau.m_noHuffmanSegments, 15);

    // Which Huffman tree is selected at each switch? Use a zero-terminated
    // bit run of MTF:ed index values

    // Init the MTF alphabet
    int[] mtfAlpha = new int[htau.m_trees.length];
    for (int i = 0; i < htau.m_trees.length; i++)
    {
        mtfAlpha[i] = i;
    }
    int[] treeUsageMtf = new int[htau.m_noHuffmanSegments];
    new IntMoveToFront(mtfAlpha).encode(htau.m_treeUsage, treeUsageMtf);

    for (int i = 0; i < htau.m_noHuffmanSegments; i++)
    {
        // A zero-terminated bit run for the values 0..5
        int val = 0;
        while (val < treeUsageMtf[i])
        {
            m_out.writeBit(true);
            val++;
        }
        m_out.writeBit(false);
    }

    // Encode each Huffman tree
    for (int i = 0; i < htau.m_trees.length; i++)
    {
        encodeHuffmanTree(htau.m_trees[i], htau.m_eobSymbol + 1, m_out);
    }
}

/**
 * The result of {@code createHuffmanTrees}: the Huffman trees for a block
 * and which tree to use for each segment of the block.
 */
private static class HuffmanTreesAndUsage
{
    // The Huffman trees written to the block header and used when encoding
    // the block data.
    private HighValueBranchHuffmanTree[] m_trees;
    // The number of segments in the block. This is also the number of tree
    // selections written to the block header.
    private int m_noHuffmanSegments;
    // For each segment, the index in m_trees of the tree used to encode it.
    private int[] m_treeUsage;
    // The value of the EOB (End Of Block) symbol: one more than the highest
    // symbol value found in the data.
    private int m_eobSymbol;
}

/**
 * Encode the block and write it to the output: pad the block with the
 * overshoot needed by the sorter, Burrows Wheeler encode it, run the move
 * to front and run length encodings, create the Huffman trees, write the
 * block header and then write the Huffman encoded symbols followed by the
 * EOB symbol. The block encoder callback, if there is one, is notified
 * when the block is done.
 * @throws IOException Declared for errors when writing the encoded block.
 */
void encode() throws IOException
{
    // Fix the block overshoot. Copy DATA_OVERSHOOT bytes to the end of the
    // array. Repeat the data if the block is shorter than DATA_OVERSHOOT
    // bytes.
    int noCopied = 0;
    while (noCopied < ThreeWayRadixQuicksort.DATA_OVERSHOOT)
    {
        int noToCopy = Math.min(ThreeWayRadixQuicksort.DATA_OVERSHOOT - noCopied, m_blockSize);
        System.arraycopy(m_block, 0, m_block, m_blockSize + noCopied, noToCopy);
        noCopied += noToCopy;
    }

    // Sort the data in the block.
    // data contains the written data after the initial move to front
    // transformation
    BurrowsWheelerEncodingResult burrWhee = new BurrowsWheelerEncoder(m_block, m_blockSize, m_scratchpad).encode();

    // Run Move to front and run length encoding transformations on the
    // Burrows Wheeler encoded data
    MTFAndRLEResult rleMtfSymbols = moveToFrontAndRunLengthEncode(burrWhee.m_lastColumn, m_blockSize, getSeenByteValues());
    int[] encodedData = rleMtfSymbols.m_encodedData;

    // Create the Huffman trees. This method also infers the value of the
    // EOB symbol and adds it to the end of the encodedData array.
    HuffmanTreesAndUsage htau = createHuffmanTrees(rleMtfSymbols.m_encodedData, rleMtfSymbols.m_dataLen, rleMtfSymbols.m_noSeenDifferentSymbols);

    writeBlockHeader(m_blockChecksum, burrWhee.m_firstPointer, m_seenDifferentBytes, rleMtfSymbols, htau);

    // Write the Huffman encoded data. The EOB symbol is last in the data.
    int swapNo = 0;
    // Starting at 1 makes the first iteration select the tree for the
    // first segment.
    int noLeftUntilSwap = 1;
    HighValueBranchHuffmanTree curTree = null;
    // +1 == EOB symbol
    for (int i = 0; i < rleMtfSymbols.m_dataLen + 1; i++)
    {
        if (--noLeftUntilSwap == 0)
        {
            curTree = htau.m_trees[htau.m_treeUsage[swapNo++]];
            noLeftUntilSwap = NO_OF_SYMBOLS_PER_SEGMENT;
        }
        curTree.write(m_out, encodedData[i]);
    }
    assert swapNo == htau.m_noHuffmanSegments;

    if (m_blockEncoderCallback != null)
    {
        m_blockEncoderCallback.reportBlockDone();
    }
}
published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.at4j.comp.bzip2; + +import java.io.IOException; + +/** + * This is used by the {@link BlockOutputStream} to encode a block in a separate + * encoding thread. It uses a {@link BlockEncoder} to do the actual encoding. + * @author Karl Gustafsson + * @since 1.1 + */ +final class BlockEncoderRunnable implements Runnable +{ + private final BlockEncoder m_encoder; + private final Object m_errorOwner; + + BlockEncoderRunnable(final BlockEncoder be, final Object errorOwner) + { + m_encoder = be; + m_errorOwner = errorOwner; + } + + public void run() + { + try + { + m_encoder.setScratchpad(((EncodingThread) Thread.currentThread()).getScratchpad()); + m_encoder.encode(); + } + catch (IOException e) + { + + ((EncodingThread) Thread.currentThread()).getErrorState().registerError(e, m_errorOwner); + } + catch (RuntimeException e) + { + ((EncodingThread) Thread.currentThread()).getErrorState().registerError(e, m_errorOwner); + } + catch (Error e) + { + + ((EncodingThread) Thread.currentThread()).getErrorState().registerError(e, m_errorOwner); + } + } +} diff --git a/src/main/java/org/at4j/comp/bzip2/BlockOutputStream.java b/src/main/java/org/at4j/comp/bzip2/BlockOutputStream.java new file mode 100644 index 0000000..5837bf2 --- /dev/null +++ b/src/main/java/org/at4j/comp/bzip2/BlockOutputStream.java @@ -0,0 +1,355 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you 
can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.at4j.comp.bzip2; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.Arrays; + +import org.at4j.support.io.BitOutput; +import org.at4j.support.io.LittleEndianBitOutputStream; + +/** + * Used by {@link BZip2OutputStream} to RLE encode data and then write it to + * compressed blocks. + * @author Karl Gustafsson + * @since 1.1 + */ +final class BlockOutputStream extends OutputStream +{ + /** + * The different states of the run length encoder. + */ + private static enum RLEState + { + ENCODING_SINGLE, COUNTING_MULTIPLE; + } + + // The maximum number of encoded repeated bytes. + private static final int MAX_NO_OF_RLE_REPEATS = 251; + + // The state of the run length encoder. + private RLEState m_rleState; + // The last byte value that write was called with. Used to keep track of + // the run length encoding. + private int m_last = -1; + // How many equal bytes in a row has write been called with. Used to keep + // track of the run length encoding. + private int m_numberOfSame; + // Encoded data is written to this. + private final BitOutput m_wrapped; + // The size of a Burrows Wheeler block, in bytes. + private final int m_blockSize; + // How many times should the Huffman trees be refined before encoding data? 
+ private final int m_numberOfHuffmanTreeRefinementIterations; + // Bit flags indicating which bytes that occur at least once in the current + // block. + private boolean[] m_seenDifferentBytesInCurBlock; + // The data in the current block. + private byte[] m_block; + // If we are using separate encoding threads, this executor is used to + // schedule blocks for execution. Otherwise it is null. + private final BZip2EncoderExecutorServiceImpl m_encodingExecutor; + // A token identifying who owns the errors that may be caused by jobs that + // we might schedule in the executor. This is null if no executor is used. + private final Object m_errorOwner; + + // Contains preallocated data structures. Used to reduce the number of + // temporary objects that are created and thus avoid time spent gc:ing. + // This is null if an executor is used for encoding. + private final EncodingScratchpad m_scratchpad; + + // If we use several encoder threads, this object is used for writing the + // encoded blocks in the right order. Otherwise it is null. + private final EncodedBlockWriter m_encodedBlockWriter; + + // The checksum for the current block. + private CRC m_blockChecksum; + // The checksum for the entire file. + private int m_fileChecksum = 0; + + // The number of different bytes seen in the current block. + private int m_noSeenDifferentBytesInCurBlock; + private int m_blockPointer; + + private int m_blockNo = 0; + + BlockOutputStream(BitOutput wrapped, int blockSize, int numberOfHuffmanTreeRefinementIterations, BZip2EncoderExecutorServiceImpl ex, Object errorOwner, EncodedBlockWriter ebw, EncodingScratchpad sp) + { + // Can only have one, not both. + assert ex == null ^ sp == null; + + m_wrapped = wrapped; + m_blockSize = blockSize; + m_numberOfHuffmanTreeRefinementIterations = numberOfHuffmanTreeRefinementIterations; + m_blockChecksum = new CRC(); + m_scratchpad = sp; + // May be null. 
+ m_encodingExecutor = ex; + // May be null + m_errorOwner = errorOwner; + // May be null. + m_encodedBlockWriter = ebw; + + startNewBlock(); + } + + private void startNewBlock() + { + m_blockPointer = 0; + + if (m_encodingExecutor != null) + { + // We use several threads for encoding. Create new instances for + // data that may be used right now by an encoder. + m_seenDifferentBytesInCurBlock = new boolean[256]; + m_block = new byte[m_blockSize + ThreeWayRadixQuicksort.DATA_OVERSHOOT]; + } + else + { + // We encode in this thread. It is safe to reuse variables. + if (m_seenDifferentBytesInCurBlock == null) + { + m_seenDifferentBytesInCurBlock = new boolean[256]; + } + else + { + Arrays.fill(m_seenDifferentBytesInCurBlock, false); + } + + if (m_block == null) + { + m_block = new byte[m_blockSize + ThreeWayRadixQuicksort.DATA_OVERSHOOT]; + } + } + m_noSeenDifferentBytesInCurBlock = 0; + + // Reset the run length encoder state + m_last = -1; + m_numberOfSame = 0; + m_rleState = RLEState.ENCODING_SINGLE; + } + + private boolean isFull() + { + return m_blockPointer == m_blockSize; + } + + private boolean isEmpty() + { + return m_blockPointer == 0; + } + + int getFileChecksum() + { + return m_fileChecksum; + } + + /** + * Write a compressed data block. + */ + private void writeCurBlock() throws IOException + { + final int blockChecksum = m_blockChecksum.getValue(); + m_blockChecksum = new CRC(); + if (m_encodingExecutor == null) + { + // Encode the block in the current thread. + BlockEncoder be = new BlockEncoder(m_block, m_blockNo, m_blockPointer, blockChecksum, m_seenDifferentBytesInCurBlock, m_noSeenDifferentBytesInCurBlock, m_numberOfHuffmanTreeRefinementIterations, m_wrapped, null); + be.setScratchpad(m_scratchpad); + be.encode(); + } + else + { + // Hand off the block to another thread for encoding. + + // Allocate an output buffer that is 2/3rds of the size of the + // written data. 
+ ByteArrayOutputStream baos = new ByteArrayOutputStream((2 * m_blockPointer) / 3); + BitOutput out = new LittleEndianBitOutputStream(baos); + BlockEncodedCallback bec = new BlockEncodedCallback(m_blockNo, baos, out, m_encodedBlockWriter); + BlockEncoder be = new BlockEncoder(m_block, m_blockNo, m_blockPointer, blockChecksum, m_seenDifferentBytesInCurBlock, m_noSeenDifferentBytesInCurBlock, m_numberOfHuffmanTreeRefinementIterations, out, bec); + m_encodingExecutor.execute(new BlockEncoderRunnable(be, m_errorOwner)); + } + + // Update the file checksum + m_fileChecksum = (m_fileChecksum << 1) | (m_fileChecksum >>> 31); + m_fileChecksum ^= blockChecksum; + + m_blockNo++; + } + + /** + * Write a single byte. + */ + private void writeByte(final int b) throws IOException + { + m_block[m_blockPointer++] = (byte) (b & 0xFF); + if (!m_seenDifferentBytesInCurBlock[b]) + { + m_seenDifferentBytesInCurBlock[b] = true; + m_noSeenDifferentBytesInCurBlock++; + } + + if (isFull()) + { + // File f = new File("/tmp/block_" + ++m_blockNo + ".dat"); + // OutputStream os = new BufferedOutputStream(new FileOutputStream(f)); + // try + // { + // os.write(m_block, 0, m_blockPointer); + // } + // finally + // { + // os.close(); + // } + + writeCurBlock(); + startNewBlock(); + } + } + + @Override + public void write(final int b) throws IOException + { + // Run length encode + switch (m_rleState) + { + case ENCODING_SINGLE: + if (b == m_last) + { + m_numberOfSame++; + if (m_numberOfSame == 4) + { + if (m_blockPointer == m_blockSize - 1) + { + // Corner case. bzip2 cannot handle blocks that end + // with four equal bytes. End this block one byte + // earlier. + writeCurBlock(); + startNewBlock(); + write(b); + return; + } + else + { + // Four equal in a row. 
Change state + m_rleState = RLEState.COUNTING_MULTIPLE; + m_numberOfSame = 0; + } + } + } + else + { + m_last = b; + m_numberOfSame = 1; + } + m_blockChecksum.update(b); + writeByte(b); + break; + + case COUNTING_MULTIPLE: + if (b == m_last) + { + m_numberOfSame++; + if (m_numberOfSame == MAX_NO_OF_RLE_REPEATS) + { + // Cannot repeat this anymore. Update checksum, write + // and switch state. + for (int i = 0; i < MAX_NO_OF_RLE_REPEATS; i++) + { + m_blockChecksum.update(b); + } + writeByte(MAX_NO_OF_RLE_REPEATS); + m_rleState = RLEState.ENCODING_SINGLE; + m_numberOfSame = 0; + } + } + else + { + // A byte that is not same as the last. Stop counting, + // update the checksum and change state. + for (int i = 0; i < m_numberOfSame; i++) + { + m_blockChecksum.update(m_last); + } + writeByte(m_numberOfSame); + m_blockChecksum.update(b); + writeByte(b); + m_numberOfSame = 1; + m_last = b; + m_rleState = RLEState.ENCODING_SINGLE; + } + break; + + default: + throw new RuntimeException("Unknown encoding state " + m_rleState + ". This is a bug"); + } + } + + @Override + public void write(final byte[] data) throws IOException + { + for (int i = 0; i < data.length; i++) + { + write(data[i] & 0xFF); + } + } + + @Override + public void write(final byte[] data, final int offset, final int len) throws IOException + { + // Range validation is done by BZip2OutputStream + for (int i = offset; i < offset + len; i++) + { + write(data[i] & 0xFF); + } + } + + @Override + public void close() throws IOException + { + if (m_rleState == RLEState.COUNTING_MULTIPLE) + { + // Update the checksum and write the current count. + for (int i = 0; i < m_numberOfSame; i++) + { + m_blockChecksum.update(m_last & 0xFF); + } + writeByte(m_numberOfSame); + } + + if (!isEmpty()) + { + writeCurBlock(); + } + + if (m_encodedBlockWriter != null) + { + // Tell the encoded block writer that we're done. + m_encodedBlockWriter.writeBlock(m_blockNo, null); + } + + // Don't close the wrapped BitOutput. 
It will be used later on to write + // the EOF block. + + super.close(); + } +} diff --git a/src/main/java/org/at4j/comp/bzip2/BurrowsWheelerDecoder.java b/src/main/java/org/at4j/comp/bzip2/BurrowsWheelerDecoder.java new file mode 100644 index 0000000..a9339bd --- /dev/null +++ b/src/main/java/org/at4j/comp/bzip2/BurrowsWheelerDecoder.java @@ -0,0 +1,120 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.at4j.comp.bzip2; + +import java.io.IOException; +import java.io.InputStream; + +/** + * Decode Burrows Wheeler encoded data. 
/**
 * Decode Burrows Wheeler encoded data.
 * <p>
 * The decoder builds a transformation vector from the encoded data (the
 * last column of the sorted rotation matrix) and the byte frequencies, and
 * returns a stream that follows the vector to produce the original bytes
 * in order.
 * @author Karl Gustafsson
 * @since 1.1
 */
final class BurrowsWheelerDecoder
{
    /**
     * A stream that produces the decoded bytes by following the chain of
     * positions in the transformation vector.
     */
    static class BWInputStream extends InputStream
    {
        // The encoded data.
        private final byte[] m_decoded;
        // The transformation vector. Its length is the number of bytes to
        // produce.
        private final int[] m_ptr;

        // The position in m_decoded of the next byte to return.
        private int m_curPointer;
        private boolean m_eof;
        private int m_noLeftToRead;

        /**
         * @param decoded The encoded data.
         * @param ptr The transformation vector.
         * @param originalDataPointer The row number of the original data.
         * Must be a valid index in {@code ptr}.
         */
        BWInputStream(byte[] decoded, int[] ptr, int originalDataPointer)
        {
            m_decoded = decoded;
            m_ptr = ptr;
            m_curPointer = ptr[originalDataPointer];
            m_noLeftToRead = ptr.length;
        }

        @Override
        public int read() throws IOException
        {
            if (m_eof)
            {
                return -1;
            }
            final int res = m_decoded[m_curPointer] & 0xFF;
            m_eof = --m_noLeftToRead == 0;
            m_curPointer = m_ptr[m_curPointer];
            return res;
        }
    }

    private final byte[] m_decoded;
    private final int m_noBytesDecoded;
    private final int[] m_byteFrequencies;
    private final int m_originalDataPointer;

    /**
     * @param encoded The encoded data. This array may be longer than the actual
     * amount of encoded data. The {@code noBytesEncoded} parameter determines
     * how much of the array that will be used.
     * @param noBytesEncoded The length of the encoded data.
     * @param byteFrequencies The number of times each byte occur in the data.
     * @param originalDataPointer The row number of the original data in the
     * Burrows Wheeler matrix. It must be a valid row number, i.e. at least
     * zero and less than {@code noBytesEncoded}.
     * @throws IOException If the pointer to the original data is not a
     * valid row number.
     */
    BurrowsWheelerDecoder(byte[] encoded, int noBytesEncoded, int[] byteFrequencies, int originalDataPointer) throws IOException
    {
        // The pointer is used as an index into a vector with noBytesEncoded
        // elements, so a pointer equal to noBytesEncoded is invalid too.
        if (originalDataPointer < 0 || originalDataPointer >= noBytesEncoded)
        {
            throw new IOException("Invalid pointer to original data in block header " + originalDataPointer + ". It must be non-negative and less than the size of data in the block " + noBytesEncoded);
        }

        m_decoded = encoded;
        m_noBytesDecoded = noBytesEncoded;
        m_byteFrequencies = byteFrequencies;
        m_originalDataPointer = originalDataPointer;
    }

    /**
     * Create a stream that returns the decoded data.
     * @return A stream containing exactly as many bytes as there were
     * encoded bytes.
     */
    InputStream decode()
    {
        // Calculate the transformation vector used to move from the encoded
        // data to the decoded.

        // The byte frequency array contains the frequency of each byte in the
        // data. Create a new array tarr that, for each byte, specifies how many
        // bytes of lower value that occurs in the data.
        int[] tarr = new int[256];
        tarr[0] = 0;
        for (int i = 1; i < 256; i++)
        {
            tarr[i] = tarr[i - 1] + m_byteFrequencies[i - 1];
        }

        // The ptr array will contain a chain of positions of the decoded bytes
        // in the decoded array.
        final int[] ptr = new int[m_noBytesDecoded];
        for (int i = 0; i < m_noBytesDecoded; i++)
        {
            int val = m_decoded[i] & 0xFF;
            // Get the position for this byte in ptr. Increment the position
            // for the byte value so that the next occurrence of the value
            // ends up in the next position.
            int ttPos = tarr[val]++;
            ptr[ttPos] = i;
        }

        return new BWInputStream(m_decoded, ptr, m_originalDataPointer);
    }
}
+ * @author Karl Gustafsson + * @since 1.1 + */ +final class BurrowsWheelerEncoder +{ + static class BurrowsWheelerEncodingResult + { + // The values of the last column of the matrix + final byte[] m_lastColumn; + // The row number of the first row (the row which contains the incoming + // data) in the sorted matrix + final int m_firstPointer; + + private BurrowsWheelerEncodingResult(byte[] lastColumn, int firstPointer) + { + m_lastColumn = lastColumn; + m_firstPointer = firstPointer; + } + } + + // The shortest length that will be quicksorted rather than shell sorted. Constant, hence final. + private static final int MIN_QUICKSORT_LENGTH = 18; + + // The data array containing the unencoded data. + private final byte[] m_data; + // The length of the data in the array. Data occupies the positions 0 to + // m_length - 1 in the array. + private final int m_length; + // Contains preallocated data structures. Used to reduce the number of + // temporary objects that are created and thus avoid time spent gc:ing. + private final EncodingScratchpad m_scratchpad; + + /** + * @param data This array should contain a 100 byte overshoot. See + * {@link ThreeWayRadixQuicksort#ThreeWayRadixQuicksort(byte[], int, int, EncodingScratchpad)} + * . + */ + BurrowsWheelerEncoder(byte[] data, int length, EncodingScratchpad sp) + { + if (length > data.length) + { + throw new IllegalArgumentException("Invalid data length " + length + ". It must be <= the length of the data array (" + data.length + ")"); + } + m_data = data; + m_length = length; + m_scratchpad = sp; + } + + /** + * Run a Burrows Wheeler encoding and return the last column of the sorted rotation matrix together with the row that holds the unrotated input. + */ + BurrowsWheelerEncodingResult encode() + { + // Create all rotations of m_data, put them in a matrix and sort the + // first column. For each row in the matrix, ptr contains a pointer to + // the first byte of the row's m_data rotation. + int[] ptr = new ThreeWayRadixQuicksort(m_data, m_length, MIN_QUICKSORT_LENGTH, m_scratchpad).sort(); + + // Get the contents of the last column in the matrix. 
This, and the + // pointer to the location of where the first byte in m_data is in the + // last column, is the result from the Burrows Wheeler encoding. + byte[] lastColumn = m_scratchpad.m_lastColumn; + int firstRow = -1; + + for (int i = 0; i < m_length; i++) + { + int fePtr = ptr[i] - 1; + if (fePtr < 0) + { + fePtr += m_length; + firstRow = i; + } + lastColumn[i] = m_data[fePtr]; + } + return new BurrowsWheelerEncodingResult(lastColumn, firstRow); + } +} diff --git a/src/main/java/org/at4j/comp/bzip2/CRC.java b/src/main/java/org/at4j/comp/bzip2/CRC.java new file mode 100644 index 0000000..b7993c7 --- /dev/null +++ b/src/main/java/org/at4j/comp/bzip2/CRC.java @@ -0,0 +1,63 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.at4j.comp.bzip2; + +/** + * Checksum algorithm used by bzip2. 
+ * @author Karl Gustafsson + * @since 1.1 + */ +final class CRC +{ + // Table from bzip2's crctable.c + private static final int[] CRC_TABLE = new int[] { 0x00000000, 0x04c11db7, 0x09823b6e, 0x0d4326d9, 0x130476dc, 0x17c56b6b, 0x1a864db2, 0x1e475005, 0x2608edb8, 0x22c9f00f, 0x2f8ad6d6, 0x2b4bcb61, 0x350c9b64, 0x31cd86d3, 0x3c8ea00a, + 0x384fbdbd, 0x4c11db70, 0x48d0c6c7, 0x4593e01e, 0x4152fda9, 0x5f15adac, 0x5bd4b01b, 0x569796c2, 0x52568b75, 0x6a1936c8, 0x6ed82b7f, 0x639b0da6, 0x675a1011, 0x791d4014, 0x7ddc5da3, 0x709f7b7a, 0x745e66cd, 0x9823b6e0, 0x9ce2ab57, + 0x91a18d8e, 0x95609039, 0x8b27c03c, 0x8fe6dd8b, 0x82a5fb52, 0x8664e6e5, 0xbe2b5b58, 0xbaea46ef, 0xb7a96036, 0xb3687d81, 0xad2f2d84, 0xa9ee3033, 0xa4ad16ea, 0xa06c0b5d, 0xd4326d90, 0xd0f37027, 0xddb056fe, 0xd9714b49, 0xc7361b4c, + 0xc3f706fb, 0xceb42022, 0xca753d95, 0xf23a8028, 0xf6fb9d9f, 0xfbb8bb46, 0xff79a6f1, 0xe13ef6f4, 0xe5ffeb43, 0xe8bccd9a, 0xec7dd02d, 0x34867077, 0x30476dc0, 0x3d044b19, 0x39c556ae, 0x278206ab, 0x23431b1c, 0x2e003dc5, 0x2ac12072, + 0x128e9dcf, 0x164f8078, 0x1b0ca6a1, 0x1fcdbb16, 0x018aeb13, 0x054bf6a4, 0x0808d07d, 0x0cc9cdca, 0x7897ab07, 0x7c56b6b0, 0x71159069, 0x75d48dde, 0x6b93dddb, 0x6f52c06c, 0x6211e6b5, 0x66d0fb02, 0x5e9f46bf, 0x5a5e5b08, 0x571d7dd1, + 0x53dc6066, 0x4d9b3063, 0x495a2dd4, 0x44190b0d, 0x40d816ba, 0xaca5c697, 0xa864db20, 0xa527fdf9, 0xa1e6e04e, 0xbfa1b04b, 0xbb60adfc, 0xb6238b25, 0xb2e29692, 0x8aad2b2f, 0x8e6c3698, 0x832f1041, 0x87ee0df6, 0x99a95df3, 0x9d684044, + 0x902b669d, 0x94ea7b2a, 0xe0b41de7, 0xe4750050, 0xe9362689, 0xedf73b3e, 0xf3b06b3b, 0xf771768c, 0xfa325055, 0xfef34de2, 0xc6bcf05f, 0xc27dede8, 0xcf3ecb31, 0xcbffd686, 0xd5b88683, 0xd1799b34, 0xdc3abded, 0xd8fba05a, 0x690ce0ee, + 0x6dcdfd59, 0x608edb80, 0x644fc637, 0x7a089632, 0x7ec98b85, 0x738aad5c, 0x774bb0eb, 0x4f040d56, 0x4bc510e1, 0x46863638, 0x42472b8f, 0x5c007b8a, 0x58c1663d, 0x558240e4, 0x51435d53, 0x251d3b9e, 0x21dc2629, 0x2c9f00f0, 0x285e1d47, + 0x36194d42, 0x32d850f5, 0x3f9b762c, 0x3b5a6b9b, 
0x0315d626, 0x07d4cb91, 0x0a97ed48, 0x0e56f0ff, 0x1011a0fa, 0x14d0bd4d, 0x19939b94, 0x1d528623, 0xf12f560e, 0xf5ee4bb9, 0xf8ad6d60, 0xfc6c70d7, 0xe22b20d2, 0xe6ea3d65, 0xeba91bbc, + 0xef68060b, 0xd727bbb6, 0xd3e6a601, 0xdea580d8, 0xda649d6f, 0xc423cd6a, 0xc0e2d0dd, 0xcda1f604, 0xc960ebb3, 0xbd3e8d7e, 0xb9ff90c9, 0xb4bcb610, 0xb07daba7, 0xae3afba2, 0xaafbe615, 0xa7b8c0cc, 0xa379dd7b, 0x9b3660c6, 0x9ff77d71, + 0x92b45ba8, 0x9675461f, 0x8832161a, 0x8cf30bad, 0x81b02d74, 0x857130c3, 0x5d8a9099, 0x594b8d2e, 0x5408abf7, 0x50c9b640, 0x4e8ee645, 0x4a4ffbf2, 0x470cdd2b, 0x43cdc09c, 0x7b827d21, 0x7f436096, 0x7200464f, 0x76c15bf8, 0x68860bfd, + 0x6c47164a, 0x61043093, 0x65c52d24, 0x119b4be9, 0x155a565e, 0x18197087, 0x1cd86d30, 0x029f3d35, 0x065e2082, 0x0b1d065b, 0x0fdc1bec, 0x3793a651, 0x3352bbe6, 0x3e119d3f, 0x3ad08088, 0x2497d08d, 0x2056cd3a, 0x2d15ebe3, 0x29d4f654, + 0xc5a92679, 0xc1683bce, 0xcc2b1d17, 0xc8ea00a0, 0xd6ad50a5, 0xd26c4d12, 0xdf2f6bcb, 0xdbee767c, 0xe3a1cbc1, 0xe760d676, 0xea23f0af, 0xeee2ed18, 0xf0a5bd1d, 0xf464a0aa, 0xf9278673, 0xfde69bc4, 0x89b8fd09, 0x8d79e0be, 0x803ac667, + 0x84fbdbd0, 0x9abc8bd5, 0x9e7d9662, 0x933eb0bb, 0x97ffad0c, 0xafb010b1, 0xab710d06, 0xa6322bdf, 0xa2f33668, 0xbcb4666d, 0xb8757bda, 0xb5365d03, 0xb1f740b4 }; + + private int m_crc = 0xFFFFFFFF; + + /** + * @param b An integer value in the interval 0..255. 
+ */ + void update(int b) + { + if ((b < 0) || (b > 255)) + { + throw new IllegalArgumentException("" + b); + } + + m_crc = (m_crc << 8) ^ CRC_TABLE[(m_crc >>> 24) ^ b]; + } + + int getValue() + { + return ~m_crc; + } +} diff --git a/src/main/java/org/at4j/comp/bzip2/CompressedDataBlock.java b/src/main/java/org/at4j/comp/bzip2/CompressedDataBlock.java new file mode 100644 index 0000000..7bfbcc0 --- /dev/null +++ b/src/main/java/org/at4j/comp/bzip2/CompressedDataBlock.java @@ -0,0 +1,51 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.at4j.comp.bzip2; + +import java.io.InputStream; + +/** + * A bzip2 block containing compressed data. 
+ * @author Karl Gustafsson + * @since 1.1 + */ +final class CompressedDataBlock implements Block +{ + private final InputStream m_stream; + private final int m_blockChecksum; + + CompressedDataBlock(InputStream stream, int blockChecksum) + { + // Null check + stream.getClass(); + + m_stream = stream; + m_blockChecksum = blockChecksum; + } + + InputStream getStream() + { + return m_stream; + } + + int getBlockChecksum() + { + return m_blockChecksum; + } +} diff --git a/src/main/java/org/at4j/comp/bzip2/EncodedBlockData.java b/src/main/java/org/at4j/comp/bzip2/EncodedBlockData.java new file mode 100644 index 0000000..baceb2f --- /dev/null +++ b/src/main/java/org/at4j/comp/bzip2/EncodedBlockData.java @@ -0,0 +1,38 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.at4j.comp.bzip2; + +/** + * This object contains data for an encoded bzip2 block. 
+ * @author Karl Gustafsson + * @since 1.1 + */ +final class EncodedBlockData +{ + final byte[] m_bytes; + final int m_noBits; + final int m_bitValue; + + EncodedBlockData(byte[] bytes, int noBits, int bitValue) + { + m_bytes = bytes; + m_noBits = noBits; + m_bitValue = bitValue; + } +} diff --git a/src/main/java/org/at4j/comp/bzip2/EncodedBlockWriter.java b/src/main/java/org/at4j/comp/bzip2/EncodedBlockWriter.java new file mode 100644 index 0000000..6a34f68 --- /dev/null +++ b/src/main/java/org/at4j/comp/bzip2/EncodedBlockWriter.java @@ -0,0 +1,146 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.at4j.comp.bzip2; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.CountDownLatch; + +import org.at4j.support.io.BitOutput; + +/** + * This is used to write encoded blocks in the right order when several encoding + * threads are used with the {@link BZip2OutputStream}. + * @author Karl Gustafsson + * @since 1.1 + */ +final class EncodedBlockWriter +{ + // All variables are protected by this object's intrinsic lock + private final BitOutput m_out; + private final Map m_savedBlocks = new HashMap(); + // This latch is used to signal to the bzip2 output stream when this writer + // is finished. 
+ private final CountDownLatch m_doneLatch = new CountDownLatch(1); + private int m_nextBlockToWrite = 0; + private boolean m_hasError; + + EncodedBlockWriter(BitOutput out) + { + m_out = out; + } + + private void writeEncodedBlockData(final EncodedBlockData bd) throws IOException + { + m_out.writeBytes(bd.m_bytes, 0, bd.m_bytes.length); + if (bd.m_noBits > 0) + { + m_out.writeBits(bd.m_bitValue, bd.m_noBits); + } + } + + private void writeBlockInternal(final int blockNo, final EncodedBlockData blockData) throws IOException // writes the in-order block, then drains the consecutive saved blocks that follow it + { + if (blockData == null) + { + // We're done: a null block is the end of stream marker + m_doneLatch.countDown(); + } + else + { + writeEncodedBlockData(blockData); + + while (m_savedBlocks.containsKey(++m_nextBlockToWrite)) + { + final EncodedBlockData savedBd = m_savedBlocks.remove(m_nextBlockToWrite); // remove (not get) so a written block is no longer referenced by the map + if (savedBd != null) + { + writeEncodedBlockData(savedBd); + } + else + { + m_doneLatch.countDown(); // the saved entry was the end of stream marker + break; + } + } + } + } + + /** + * It is not time to write this block just yet. Save it until it is time. + * @param blockNo The block number. + * @param blockData The block data. + */ + private void saveBlock(final int blockNo, EncodedBlockData blockData) + { + m_savedBlocks.put(blockNo, blockData); + } + + /** + * Write the block data to the output if it is the next block to write. If + * not, queue it for later writing. + * @param blockNo The block number. + * @param blockData The block data or {@code null} as an end of stream + * marker. 
+ * @throws IOException + */ + synchronized void writeBlock(final int blockNo, final EncodedBlockData blockData) throws IOException + { + if (m_hasError) + { + return; + } + + try + { + if (blockNo == m_nextBlockToWrite) + { + writeBlockInternal(blockNo, blockData); + } + else + { + saveBlock(blockNo, blockData); + } + } + catch (Error e) + { + m_hasError = true; + m_doneLatch.countDown(); + throw e; + } + catch (RuntimeException e) + { + m_hasError = true; + m_doneLatch.countDown(); + throw e; + } + catch (IOException e) + { + m_hasError = true; + m_doneLatch.countDown(); + throw e; + } + } + + void waitFor() throws InterruptedException + { + m_doneLatch.await(); + } +} diff --git a/src/main/java/org/at4j/comp/bzip2/EncodingScratchpad.java b/src/main/java/org/at4j/comp/bzip2/EncodingScratchpad.java new file mode 100644 index 0000000..c882ce7 --- /dev/null +++ b/src/main/java/org/at4j/comp/bzip2/EncodingScratchpad.java @@ -0,0 +1,107 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.at4j.comp.bzip2; + +/** + * This object contains different objects used by a bzip2 encoder thread. It is + * used to reduce the number of object and array allocations. 
+ * @author Karl Gustafsson + * @since 1.1 + */ +final class EncodingScratchpad +{ + private static final int MAX_BLOCK_LENGTH = BZip2OutputStreamSettings.MAX_BLOCK_SIZE * 100 * 1000; + private static final int MAX_NO_OF_SEGMENTS = MAX_BLOCK_LENGTH / BlockEncoder.NO_OF_SYMBOLS_PER_SEGMENT; + + // An array that may contain the frequencies of each symbol in the data. + final int[] m_frequencies = new int[BlockEncoder.MAX_NO_OF_MTF_SYMBOLS]; + + // A move to front alphabet. + final byte[] m_mtfAlphabet = new byte[BlockEncoder.MAX_NO_OF_MTF_SYMBOLS]; + + // This two dimensional array can contain the frequencies for the different + // symbols encoded by the different trees (up to six trees) + final int[][] m_frequencies2d = new int[BlockEncoder.MAX_NO_OF_HUFFMAN_TREES][BlockEncoder.MAX_NO_OF_MTF_SYMBOLS]; + + // Contains MTF and RL encoded data before the Huffman encoding. The maximum + // size is the maximum size of a block + the EOB symbol. The actual size + // will probably be significantly shorter than this + final int[] m_encodedData = new int[MAX_BLOCK_LENGTH + 1]; + + // Frequencies of each two-byte combination used for the radix sort. + // Use an overshoot of one position. + final int[] m_twoByteFrequencies = new int[65536 + 1]; + + // Pointers created by the 3-way radix quicksort + final int[] m_ptrs = new int[MAX_BLOCK_LENGTH]; + + // A cache for sort results + final int[] m_sortCache = new int[MAX_BLOCK_LENGTH + ThreeWayRadixQuicksort.DATA_OVERSHOOT]; + + // Array for temporary data. This will be grown incrementally as the need + // arises. 
+ int[] m_tempArea = new int[1024]; + + // Stack for block sorting + final ThreeWayRadixQuicksort.QuickSortRangeInfo[] m_sortStack = new ThreeWayRadixQuicksort.QuickSortRangeInfo[ThreeWayRadixQuicksort.SORT_STACK_SIZE]; + + // The results when all segments of a block is encoded with all available + // Huffman trees + final int[][] m_encodingResults = new int[MAX_NO_OF_SEGMENTS][BlockEncoder.MAX_NO_OF_HUFFMAN_TREES]; + + final int[] m_categoriesPerSegment = new int[MAX_NO_OF_SEGMENTS]; + + // The last column after Burrows Wheeler encoding + final byte[] m_lastColumn = new byte[MAX_BLOCK_LENGTH]; + + // The bucket sorting order + final int[] m_sortOrder = new int[256]; + // Used when scanning pointers + final int[] m_copyStart = new int[256]; + final int[] m_copyEnd = new int[256]; + + // Mapping between a symbol and its index number in the array of symbols + // used by the run length encoder. + final byte[] m_sequenceMap = new byte[256]; + + // Heap used when calculating Huffman tree code lengths + final int[] m_htHeap = new int[BlockEncoder.MAX_NO_OF_MTF_SYMBOLS + 2]; + final int[] m_htWeight = new int[BlockEncoder.MAX_NO_OF_MTF_SYMBOLS * 2]; + final int[] m_htParent = new int[BlockEncoder.MAX_NO_OF_MTF_SYMBOLS * 2]; + + // Flags for all sorted large buckets + final boolean[] m_sortedLargeBuckets = new boolean[256]; + // Flags for all sorted small buckets + final boolean[] m_sortedSmallBuckets = new boolean[256 * 256]; + + /** + * Get a temporary integer array of with a length of at least {@code len} + * integers. + */ + int[] getTemp(final int len) + { + // Is the current temp area large enough? + if (m_tempArea.length < len) + { + // No. 
Reallocate it + m_tempArea = new int[len + 100]; + } + return m_tempArea; + } +} diff --git a/src/main/java/org/at4j/comp/bzip2/EncodingThread.java b/src/main/java/org/at4j/comp/bzip2/EncodingThread.java new file mode 100644 index 0000000..7203639 --- /dev/null +++ b/src/main/java/org/at4j/comp/bzip2/EncodingThread.java @@ -0,0 +1,49 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.at4j.comp.bzip2; + +/** + * This is the kind of thread used for encoding bzip2 blocks. + * @author Karl Gustafsson + * @since 1.1 + */ +final class EncodingThread extends Thread +{ + private final EncodingScratchpad m_scratchpad = new EncodingScratchpad(); + private final ErrorState m_errorState; + + EncodingThread(Runnable r, ErrorState es) + { + super(r); + m_errorState = es; + } + + /** + * Get this thread's scratchpad. 
+ */ + EncodingScratchpad getScratchpad() + { + return m_scratchpad; + } + + ErrorState getErrorState() + { + return m_errorState; + } +} diff --git a/src/main/java/org/at4j/comp/bzip2/EncodingThreadFactory.java b/src/main/java/org/at4j/comp/bzip2/EncodingThreadFactory.java new file mode 100644 index 0000000..bea7cf5 --- /dev/null +++ b/src/main/java/org/at4j/comp/bzip2/EncodingThreadFactory.java @@ -0,0 +1,41 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.at4j.comp.bzip2; + +import java.util.concurrent.ThreadFactory; + +/** + * This is a factory for creating {@link EncodingThread} objects. 
+ * @author Karl Gustafsson + * @since 1.1 + */ +final class EncodingThreadFactory implements ThreadFactory +{ + private final ErrorState m_errorState; + + EncodingThreadFactory(ErrorState es) + { + m_errorState = es; + } + + public Thread newThread(Runnable r) + { + return new EncodingThread(r, m_errorState); + } +} diff --git a/src/main/java/org/at4j/comp/bzip2/EosBlock.java b/src/main/java/org/at4j/comp/bzip2/EosBlock.java new file mode 100644 index 0000000..9871d00 --- /dev/null +++ b/src/main/java/org/at4j/comp/bzip2/EosBlock.java @@ -0,0 +1,39 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.at4j.comp.bzip2; + +/** + * A bzip2 block containing end of stream information. 
+ * @author Karl Gustafsson + * @since 1.1 + */ +final class EosBlock implements Block +{ + private final long m_readCrc; + + EosBlock(long readCrc) + { + m_readCrc = readCrc; + } + + long getReadCrc() + { + return m_readCrc; + } +} diff --git a/src/main/java/org/at4j/comp/bzip2/ErrorState.java b/src/main/java/org/at4j/comp/bzip2/ErrorState.java new file mode 100644 index 0000000..3cb97c6 --- /dev/null +++ b/src/main/java/org/at4j/comp/bzip2/ErrorState.java @@ -0,0 +1,52 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.at4j.comp.bzip2; + +import java.io.IOException; + +/** + * This is used to keep track of encoding errors. + *

+ * Every error is registered with an owner token that is a unique identifier for + * the object that is affected by the error. The owner token object must have a + * good {@link Object#hashCode()} method. + * @author Karl Gustafsson + * @since 1.1 + */ +interface ErrorState +{ + /** + * Register an {@link Exception} or an {@link Error}. + * @param t The exception or error. + * @param ownerToken A unique identifier for the error owner, i.e. the + * object that the encoding thread is performing work for. + */ + void registerError(Throwable t, Object ownerToken); + + /** + * Check for errors. + * @param ownerToken The owner. + * @throws Error If there is a registered {@link Error} for this owner. + * @throws RuntimeException If there is a registered + * {@link RuntimeException} for this owner. + * @throws IOException If there is a registered {@link IOException} for this + * owner. + */ + void checkAndClearErrors(Object ownerToken) throws Error, RuntimeException, IOException; +} diff --git a/src/main/java/org/at4j/comp/bzip2/HighValueBranchHuffmanTree.java b/src/main/java/org/at4j/comp/bzip2/HighValueBranchHuffmanTree.java new file mode 100644 index 0000000..fb34bd3 --- /dev/null +++ b/src/main/java/org/at4j/comp/bzip2/HighValueBranchHuffmanTree.java @@ -0,0 +1,438 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.at4j.comp.bzip2; + +import java.io.IOException; +import java.util.Arrays; + +import org.at4j.support.io.BitInput; +import org.at4j.support.io.BitOutput; + +/** + * This object represents the type of Huffman tree that is used by bzip2. The + * "high value branch" means that leaf nodes have the smallest possible values + * and non-leaf nodes have the highest possible values at each tree depth. + * @author Karl Gustafsson + * @since 1.1 + */ +final class HighValueBranchHuffmanTree +{ + private static final int MAX_NO_OF_SYMBOLS = 258; + + // The shortest code length for symbols in this tree. + private final int m_minLength; + // The longest code length for symbols in this tree. + private final int m_maxLength; + // m_maxLength - m_minLength + 1; + // Declared package private for the unit tests. + final int m_numberOfLengths; + + // The value limit at each data length, i.e. the maximum value for leaf + // nodes at that data length. + // Declared package private for the unit tests. + final int[] m_limitsPerLength; + // The lowest value for a symbol at each length. The value for length + // m_minLength is at index 0 in the array. + // Declared package private for the unit tests. + final int[] m_baseValuesPerLength; + // The offset in the m_symbolSequenceNos array for the first symbol for each + // Huffman code length. The array has the length m_maxLength - m_minLength + + // 1. The value for m_minLength is at index 0 (and is 0). + // Declared package private for the unit tests. + final int[] m_symbolOffsetPerLength; + // The index of the symbol table for Huffman code no n. + // Declared package private for the unit tests. + final int[] m_symbolSequenceNos; + // This table contains the Huffman codes and the code bit lengths for each + // symbol. 
It is created when using the constructor that calculates the + // Huffman trees to speed up encoding. + final int[][] m_huffmanCodesAndLengthsPerSymbol; + + /** + * Get the Huffman code and its bit length for a symbol. + * @param symbol The symbol. + * @param huffmanIndex The symbol's index in the list of sorted symbols. + * @param codeAndLength An int array of length 2 used to store the result + * in. + */ + private int[] getCodeAndLengthForSymbol(final int symbol, final int huffmanIndex, final int[] codeAndLength) + { + // Calculate the length of the symbol's Huffman code + int deltaLen; + for (deltaLen = 0; deltaLen < m_numberOfLengths - 1; deltaLen++) + { + if (huffmanIndex < m_symbolOffsetPerLength[deltaLen + 1]) + { + break; + } + } + + codeAndLength[0] = m_baseValuesPerLength[deltaLen] + (huffmanIndex - m_symbolOffsetPerLength[deltaLen]); + codeAndLength[1] = m_minLength + deltaLen; + return codeAndLength; + } + + /** + * Create a canonical Huffman tree for the supplied symbols. + *

+ * Symbol lengths for a canonical Huffman tree can be created by the + * {@link #createCodeLengths(int[], int, int, EncodingScratchpad)} method. + * @param symbolLengths The length of the Huffman code for each symbol. + * @param minLength The shortest Huffman code length in the tree. + * @param maxLength The longest Huffman code length in the tree. + * @param forEncoding Should the tree be used for encoding? If so, a lookup + * table that contains the Huffman code for each symbol is created to speed + * up the encoding. + * @throws IllegalArgumentException If the lengths are invalid. + */ + HighValueBranchHuffmanTree(final int[] symbolLengths, final int minLength, final int maxLength, final boolean forEncoding) throws IllegalArgumentException + { + if ((minLength < 0) || (maxLength < minLength)) + { + throw new IllegalArgumentException("Illegal min or max length, min: " + minLength + ", max: " + maxLength); + } + + final int numberOfSymbols = symbolLengths.length; + final int numberOfLengths = maxLength - minLength + 1; + // Create an array of symbol sequence numbers sorted on their symbol + // lengths + m_symbolSequenceNos = new int[numberOfSymbols]; + // The number of symbols having each code length + final int[] numl = new int[numberOfLengths]; + int index = 0; + for (int i = minLength; i <= maxLength; i++) + { + numl[i - minLength] = 0; + for (int j = 0; j < numberOfSymbols; j++) + { + if (symbolLengths[j] == i) + { + m_symbolSequenceNos[index++] = j; + numl[i - minLength]++; + } + } + } + + m_symbolOffsetPerLength = new int[numberOfLengths]; + m_symbolOffsetPerLength[0] = 0; + for (int i = 0; i < numberOfLengths - 1; i++) + { + m_symbolOffsetPerLength[i + 1] = m_symbolOffsetPerLength[i] + numl[i]; + } + + // The value limit at each length + m_limitsPerLength = new int[numberOfLengths - 1]; + m_baseValuesPerLength = new int[numberOfLengths]; + int prevLimit = 0; + for (int i = minLength; i <= maxLength; i++) + { + index = i - minLength; + // The base value for this length is the 
value of the smallest + // allowed symbol for this length. The smallest allowed symbol is + // the limit for the previous length with a zero at the end. + m_baseValuesPerLength[index] = prevLimit << 1; + + if (i < maxLength) + { + // The limit for this length is the base value for this length + // plus the number of symbols for this length. + prevLimit = m_baseValuesPerLength[index] + numl[index]; + m_limitsPerLength[index] = prevLimit - 1; + } + } + + m_minLength = minLength; + m_maxLength = maxLength; + m_numberOfLengths = (byte) (maxLength - minLength + 1); + if (forEncoding) + { + // Create an inverse mapping into the list of sorted symbols + final int[] huffmanIndexPerSymbol = new int[symbolLengths.length]; + Arrays.fill(huffmanIndexPerSymbol, -1); + for (int i = 0; i < m_symbolSequenceNos.length; i++) + { + huffmanIndexPerSymbol[m_symbolSequenceNos[i]] = i; + } + + // Create a table containing the Huffman code and its bit length for + // each symbol. This is used to speed up writes. + m_huffmanCodesAndLengthsPerSymbol = new int[symbolLengths.length][2]; + int[] codeAndLength = new int[2]; + for (int i = 0; i < symbolLengths.length; i++) + { + codeAndLength = getCodeAndLengthForSymbol(i, huffmanIndexPerSymbol[i], codeAndLength); + m_huffmanCodesAndLengthsPerSymbol[i][0] = codeAndLength[0]; + m_huffmanCodesAndLengthsPerSymbol[i][1] = codeAndLength[1]; + } + } + else + { + // Don't create these variables. They are only used when writing data + // and it is assumed that this constructor will only be used to create + // trees for reading data. 
+ m_huffmanCodesAndLengthsPerSymbol = null; + } + } + + private static void upHeap(final int[] heap, final int[] weight, int nHeap) + { + int tmp = heap[nHeap]; + while (weight[tmp] < weight[heap[nHeap >> 1]]) + { + heap[nHeap] = heap[nHeap >>> 1]; + nHeap >>>= 1; + } + heap[nHeap] = tmp; + } + + private static void downHeap(final int[] heap, final int[] weight, final int nHeap, int n) + { + int tmp = heap[n]; + while (true) + { + int yy = n << 1; + if (yy > nHeap) + { + break; + } + if (yy < nHeap && weight[heap[yy + 1]] < weight[heap[yy]]) + { + yy++; + } + if (weight[tmp] < weight[heap[yy]]) + { + break; + } + heap[n] = heap[yy]; + n = yy; + } + heap[n] = tmp; + } + + private static int addWeights(final int w1, final int w2) + { + final int d1 = w1 & 0xFF; + final int d2 = w2 & 0xFF; + final int ww1 = w1 & 0xFFFFFF00; + final int ww2 = w2 & 0xFFFFFF00; + return (ww1 + ww2) | (1 + (d1 > d2 ? d1 : d2)); + } + + int getMinLength() + { + return m_minLength; + } + + int getMaxLength() + { + return m_maxLength; + } + + /** + * Get a sorted array with symbol sequence numbers and their Huffman code + * lengths. The returned array is sorted with the most frequent occurring + * symbol first (i.e. the symbol with the shortest Huffman code). + *

+ * This method is used for testing. + * @return Array a[n][0] = symbol, a[n][1] = Huffman code length + */ + int[][] getSortedSymbolSequenceNosAndCodeLengths() + { + int[][] res = new int[m_symbolSequenceNos.length][2]; + int length = m_minLength; + for (int i = 0; i < m_symbolSequenceNos.length; i++) + { + while ((length < m_maxLength) && (i >= m_symbolOffsetPerLength[length - m_minLength + 1])) + { + length++; + } + res[i][0] = m_symbolSequenceNos[i]; + res[i][1] = length; + } + return res; + } + + /** + * Read the next symbol. + * @param in The input to read the symbol from. + * @return The next symbol. + * @throws IOException On I/O errors. + */ + int readNext(final BitInput in) throws IOException + { + int code = in.readBits(m_minLength); + // m_limitsPerLength.length == 0 means that all Huffman codes have the + // same length. + if (m_limitsPerLength.length == 0 || code <= m_limitsPerLength[0]) + { + return m_symbolSequenceNos[code]; + } + else + { + int codeLength = m_minLength; + int index = 1; + while (true) + { + code = (code << 1) | (in.readBit() ? 1 : 0); + codeLength++; + if ((codeLength == m_maxLength) || (code <= m_limitsPerLength[index])) + { + return m_symbolSequenceNos[m_symbolOffsetPerLength[index] + (code - m_baseValuesPerLength[index])]; + } + index++; + } + } + } + + /** + * Write a symbol. + * @param out The output to write to. + * @param symbol The symbol to write. + * @throws IOException On I/O errors. + */ + void write(final BitOutput out, final int symbol) throws IOException + { + out.writeBitsLittleEndian(m_huffmanCodesAndLengthsPerSymbol[symbol][0], m_huffmanCodesAndLengthsPerSymbol[symbol][1]); + } + + /** + * Get the number of bits used for encoding the symbol. + */ + int getBitLength(int symbol) + { + return m_huffmanCodesAndLengthsPerSymbol[symbol][1]; + } + + /** + * Calculate the Huffman code lengths for the optimal, depth-limited Huffman + * tree for the supplied symbol frequencies. + *

+ * This method uses the (slightly magic) algorithm from bzip2 1.0.5. + * @param frequencies The frequencies for each symbol in the data to be + * encoded. + * @param noSymbols The number of different symbols in the data to encode. + * This should be the maximum symbol value (the EOB symbol's value) + 1. + * @param maxLength The maximum code length which also will be the depth of + * the Huffman tree. If this is too small, this method will get stuck in an + * infinite loop. + * @return The Huffman code lengths for each symbol. + */ + static int[] createCodeLengths(final int[] frequencies, final int noSymbols, final int maxLength, final EncodingScratchpad scratchpad) + { + /* + * Nodes and heap entries run from 1. Entry 0 for both the heap and + * nodes is a sentinel. + */ + + final int[] heap = scratchpad.m_htHeap; + final int[] weight = scratchpad.m_htWeight; + final int[] parent = scratchpad.m_htParent; + + final int[] res = new int[noSymbols]; + + int actualMaxLength = -1; + int actualMinLength = Integer.MAX_VALUE; + + for (int i = 0; i < noSymbols; i++) + { + weight[i + 1] = (frequencies[i] == 0 ? 
1 : frequencies[i]) << 8; + } + + while (true) + { + int noNodes = noSymbols; + int nHeap = 0; + + heap[0] = 0; + weight[0] = 0; + parent[0] = -2; + + for (int i = 1; i <= noSymbols; i++) + { + parent[i] = -1; + nHeap++; + heap[nHeap] = i; + upHeap(heap, weight, nHeap); + } + + assert nHeap < MAX_NO_OF_SYMBOLS + 2; + + while (nHeap > 1) + { + int n1 = heap[1]; + heap[1] = heap[nHeap]; + nHeap--; + downHeap(heap, weight, nHeap, 1); + int n2 = heap[1]; + heap[1] = heap[nHeap]; + nHeap--; + downHeap(heap, weight, nHeap, 1); + noNodes++; + parent[n1] = parent[n2] = noNodes; + weight[noNodes] = addWeights(weight[n1], weight[n2]); + parent[noNodes] = -1; + nHeap++; + heap[nHeap] = noNodes; + upHeap(heap, weight, nHeap); + } + + assert noNodes < MAX_NO_OF_SYMBOLS * 2; + + boolean tooLong = false; + INNER: for (int i = 1; i <= noSymbols; i++) + { + int j = 0; + int k = i; + while (parent[k] >= 0) + { + k = parent[k]; + j++; + } + res[i - 1] = j; + if (j > maxLength) + { + tooLong = true; + break INNER; + } + + if (j > actualMaxLength) + { + actualMaxLength = j; + } + if (j < actualMinLength) + { + actualMinLength = j; + } + } + + if (!tooLong) + { + break; + } + + for (int i = 1; i <= noSymbols; i++) + { + int j = weight[i] >> 8; + j = 1 + (j / 2); + weight[i] = j << 8; + } + } + return res; + } +} diff --git a/src/main/java/org/at4j/comp/bzip2/MultipleObserverErrorState.java b/src/main/java/org/at4j/comp/bzip2/MultipleObserverErrorState.java new file mode 100644 index 0000000..b7ac53d --- /dev/null +++ b/src/main/java/org/at4j/comp/bzip2/MultipleObserverErrorState.java @@ -0,0 +1,67 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later 
version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.at4j.comp.bzip2; + +import java.io.IOException; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** + * This {@link ErrorState} may have several observers which forces us to have to + * care about the owner of each registered error. + *

+ * This is used when sharing the same + * {@link java.util.concurrent.ExecutorService} between several + * {@link BZip2OutputStream}:s. + * @author Karl Gustafsson + * @since 1.1 + */ +final class MultipleObserverErrorState implements ErrorState +{ + private Map m_errors = new ConcurrentHashMap(4); + + public void checkAndClearErrors(Object ownerToken) throws Error, RuntimeException, IOException + { + Throwable t = m_errors.remove(ownerToken); + if (t != null) + { + if (t instanceof IOException) + { + throw (IOException) t; + } + else if (t instanceof RuntimeException) + { + throw (RuntimeException) t; + } + else if (t instanceof Error) + { + throw (Error) t; + } + else + { + throw new RuntimeException(t); + } + } + } + + public void registerError(Throwable t, Object ownerToken) + { + m_errors.put(ownerToken, t); + } +} diff --git a/src/main/java/org/at4j/comp/bzip2/RLEDecodingInputStream.java b/src/main/java/org/at4j/comp/bzip2/RLEDecodingInputStream.java new file mode 100644 index 0000000..753ba57 --- /dev/null +++ b/src/main/java/org/at4j/comp/bzip2/RLEDecodingInputStream.java @@ -0,0 +1,164 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ +package org.at4j.comp.bzip2; + +import java.io.IOException; +import java.io.InputStream; + +/** + * This stream run length decodes read data. It is used by the + * {@link BZip2InputStream}. + * @author Karl Gustafsson + * @since 1.1 + */ +final class RLEDecodingInputStream extends InputStream +{ + private static enum RLEState + { + READING, REPEATING, ABOUT_TO_READ_HOW_MANY_TO_REPEAT, EOF; + } + + // Block checksum calculated while reading the block contents. + private final CRC m_blockChecksum = new CRC(); + private final InputStream m_wrapped; + private final long m_readChecksum; + + private RLEState m_state; + + private int m_noLeftToRepeat; + private int m_last; + private int m_numberOfSimilar; + + RLEDecodingInputStream(InputStream wrapped, long readChecksum) + { + m_wrapped = wrapped; + m_readChecksum = readChecksum; + m_state = RLEState.READING; + m_numberOfSimilar = 0; + m_last = -1; + } + + private void handleEof() throws IOException + { + if (m_blockChecksum.getValue() != m_readChecksum) + { + throw new IOException("Invalid block checksum. Was " + m_blockChecksum.getValue() + ", expected " + m_readChecksum); + } + } + + @Override + public int read() throws IOException + { + switch (m_state) + { + case EOF: + return -1; + + case READING: + int val = m_wrapped.read(); + if (val == -1) + { + m_state = RLEState.EOF; + handleEof(); + return -1; + } + if (val == m_last) + { + m_numberOfSimilar++; + if (m_numberOfSimilar == 4) + { + // Four in a row. The next value is a repeat number. + m_state = RLEState.ABOUT_TO_READ_HOW_MANY_TO_REPEAT; + m_numberOfSimilar = 0; + } + } + else + { + m_numberOfSimilar = 1; + m_last = val; + } + m_blockChecksum.update(val); + return val; + + case ABOUT_TO_READ_HOW_MANY_TO_REPEAT: + m_noLeftToRepeat = m_wrapped.read(); + if (m_noLeftToRepeat == -1) + { + // A rather unexpected EOF + m_state = RLEState.EOF; + handleEof(); + return -1; + } + else if (m_noLeftToRepeat == 0) + { + // Nothing to repeat. 
Go on to read the next value. + m_state = RLEState.READING; + return read(); + } + else + { + m_state = RLEState.REPEATING; + m_noLeftToRepeat--; + if (m_noLeftToRepeat == 0) + { + // Just one to repeat, which we will do in this call. + m_state = RLEState.READING; + } + m_blockChecksum.update(m_last); + return m_last; + } + + case REPEATING: + m_noLeftToRepeat--; + if (m_noLeftToRepeat == 0) + { + m_state = RLEState.READING; + } + m_blockChecksum.update(m_last); + return m_last; + + default: + throw new RuntimeException("Unknown state " + m_state + ". This is a bug"); + } + } + + @Override + public int read(byte[] barr, int off, int len) throws IOException + { + // The ranges are validated by BZip2InputStream + for (int i = 0; i < len; i++) + { + int b = read(); + if (b < 0) + { + // EOF + return i > 0 ? i : -1; + } + barr[off + i] = (byte) (b & 0xFF); + } + return len; + } + + @Override + public void close() throws IOException + { + m_wrapped.close(); + super.close(); + } +} diff --git a/src/main/java/org/at4j/comp/bzip2/SingleObserverErrorState.java b/src/main/java/org/at4j/comp/bzip2/SingleObserverErrorState.java new file mode 100644 index 0000000..8155151 --- /dev/null +++ b/src/main/java/org/at4j/comp/bzip2/SingleObserverErrorState.java @@ -0,0 +1,63 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.at4j.comp.bzip2; + +import java.io.IOException; +import java.util.concurrent.atomic.AtomicReference; + +/** + * This is used to propagate errors from encoding threads to the thread using + * the {@link BZip2OutputStream} when there is only one object using the + * encoder. + * @author Karl Gustafsson + * @since 1.1 + */ +final class SingleObserverErrorState implements ErrorState +{ + private final AtomicReference m_exception = new AtomicReference(); + + public void checkAndClearErrors(Object ownerToken) throws Error, RuntimeException, IOException + { + Throwable t = m_exception.getAndSet(null); + if (t != null) + { + if (t instanceof IOException) + { + throw (IOException) t; + } + else if (t instanceof RuntimeException) + { + throw (RuntimeException) t; + } + else if (t instanceof Error) + { + throw (Error) t; + } + else + { + throw new RuntimeException(t); + } + } + } + + public void registerError(Throwable t, Object ownerToken) + { + m_exception.set(t); + } +} diff --git a/src/main/java/org/at4j/comp/bzip2/ThreeWayRadixQuicksort.java b/src/main/java/org/at4j/comp/bzip2/ThreeWayRadixQuicksort.java new file mode 100644 index 0000000..2d76fdb --- /dev/null +++ b/src/main/java/org/at4j/comp/bzip2/ThreeWayRadixQuicksort.java @@ -0,0 +1,992 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package org.at4j.comp.bzip2; + +import java.util.Arrays; + +/** + * This sort algorithm is used by the Burrows Wheeler encoder to sort the data + * to encode. It is an amalgamation of three different sort algorithms. Radix sort + * is used to divide the input into 65536 different buckets. The quicksort is + * used to sort each bucket. When the quicksort iterations produce short enough + * blocks, shell sort is used. + *

+ * See Dr. Dobb's Journal from + * November 01 1998. + * @author Karl Gustafsson + * @since 1.1 + */ +final class ThreeWayRadixQuicksort +{ + // The amount of overshoot in the data. See below. + static final int DATA_OVERSHOOT = 20; + + // The deepest sort that we do with quicksort. Deeper sorts use shell sort. + // This value should be less than the DATA_OVERSHOOT. + private static final int QUICKSORT_DEPTH_THRESHOLD = 18; + + // The size of the sorting stack. This size is the same as for bzip2 1.0.5. + static final int SORT_STACK_SIZE = 100; + + /** + * The increments for shell sort. Borrowed from bzip2. + *

+ * Knuth's increments seem to work better than Incerpi-Sedgewick here. + * Possibly because the number of elems to sort is usually small, typically + * <= 20. + */ + private static final int[] SHELL_SORT_INCREMENTS = { 1, 4, 13, 40, 121, 364, 1093, 3280, 9841, 29524, 88573, 265720, 797161, 2391484 }; + + // Declared package private for the unit tests + static class QuickSortRangeInfo + { + private final int m_bucketStartPos; + // The length of the bucket measured in number of symbols. + private final int m_bucketLen; + private final int m_depth; + + QuickSortRangeInfo(int bucketStartPos, int bucketLen, int depth) + { + m_bucketStartPos = bucketStartPos; + m_bucketLen = bucketLen; + m_depth = depth; + } + } + + // The data array. + private final byte[] m_data; + // The length of the data in the array. Data occupies the positions 0 to + // m_length - 1 in the array. + private final int m_length; + // The shortest data block length that quicksort will be used for. For + // shorter blocks, shell sort is used. + private final int m_minLengthForQuicksort; + // Contains preallocated data structures. Used to reduce the number of + // temporary objects that are created and thus avoid time spent gc:ing. + private final EncodingScratchpad m_scratchpad; + // Cache with sort results that are used to speed up the sorting. This works + // because all strings to sort are rotations of a single string. + private final int[] m_sortCache; + // Use a stack of sort range information instead of calling the quicksort + // methods recursively. + private final QuickSortRangeInfo[] m_sortStack; + // A pointer to the current position in the sort stack. + private int m_sortStackPointer = -1; + // Array containing a pointer for each element in m_data to its location in + // the sorted data. + // This is declared package private for the unit tests. + final int[] m_ptr; + + /** + * Create a new sorting object. + * @param data The data to sort. 
This array should contain an overshoot of + * {@code DATA_OVERSHOOT} bytes. I.e: the data array should have a length of + * at least {@code length + DATA_OVERSHOOT} bytes, and the last {@code + * DATA_OVERSHOOT} bytes should be equal to the first {@code DATA_OVERSHOOT} + * bytes. This makes a few sorting optimizations possible. + *

+ * If the length of the data is less than {@code DATA_OVERSHOOT} bytes, the + * overshoot should contain the data repeated. + * @param minLengthForQuicksort Segments that are shorter than this length + * are sorted with shell sort instead of quicksort. + */ + ThreeWayRadixQuicksort(final byte[] data, final int length, final int minLengthForQuicksort, final EncodingScratchpad sp) throws IllegalArgumentException + { + assert data.length >= length + DATA_OVERSHOOT; + + if (length > data.length) + { + throw new IllegalArgumentException("Invalid data length " + length + ". It must be <= the length of the data array (" + data.length + ")"); + } + if (minLengthForQuicksort < 3) + { + throw new IllegalArgumentException("Invalid minimum length for Quicksort " + minLengthForQuicksort + ". It must be >= 3"); + } + m_data = data; + m_length = length; + m_minLengthForQuicksort = minLengthForQuicksort; + m_scratchpad = sp; + m_sortStack = m_scratchpad.m_sortStack; + // Clear the sortCache array + m_sortCache = m_scratchpad.m_sortCache; + Arrays.fill(m_sortCache, 0); + m_ptr = m_scratchpad.m_ptrs; + } + + /** + * Get the data at the specified position. It is assumed that the position + * is within the range of the data. + *

+ * This method is so small so that it will likely be inlined by the Java + * compiler. + */ + private int getDataAt(final int pos) + { + return m_data[pos] & 0xFF; + } + + /** + * Make the initial radix sort of the data into 65536 buckets. As a side + * effect, this method populates the {@code m_ptr} array with the results of + * the sort. + *

+ * This method is declared package-private for the unit tests. + * @return The start positions for each bucket (in the {@code m_ptr} array). + */ + int[] radixSort() + { + // This array will contain the frequencies of each two byte combination + // in the data. + final int[] frequencies = m_scratchpad.m_twoByteFrequencies; + Arrays.fill(frequencies, 0); + + // Iterate over the data and collect the frequencies of each occurring + // two byte combination. + int val = getDataAt(0) << 8; + for (int i = m_length - 1; i >= 0; i--) + { + val = val >>> 8 | (getDataAt(i) << 8); + frequencies[val]++; + } + + // Convert the frequencies array to contain the last data element + // position + 1 for each two byte bucket. + for (int i = 1; i < 65536; i++) + { + frequencies[i] += frequencies[i - 1]; + } + + // The m_ptr array will contain the pointers between each two byte + // combination's bucket location and its location in the data array. + // This loop will also modify the frequencies array to contain the + // starting position of each data bucket. + val = getDataAt(0) << 8; + for (int i = m_length - 1; i >= 0; i--) + { + val = val >>> 8 | (getDataAt(i) << 8); + int pos = --frequencies[val]; + m_ptr[pos] = i; + } + + // Now frequencies contain the first location of each bucket and m_ptr + // contains pointers between the data locations in the buckets and the + // data in the data array. + + return frequencies; + } + + /** + * Get the position that contains the median of the values at the three + * positions. + */ + private int med3(final int pos1, final int pos2, final int pos3, final int depth) + { + int v1, v2, v3; + if ((v1 = getDataAt(m_ptr[pos1] + depth)) == (v2 = getDataAt(m_ptr[pos2] + depth))) + { + return pos1; + } + if (((v3 = getDataAt(m_ptr[pos3] + depth)) == v1) || (v3 == v2)) + { + return pos3; + } + return v1 < v2 ? (v2 < v3 ? pos2 : (v1 < v3 ? pos3 : pos1)) : (v2 > v3 ? pos2 : (v1 < v3 ? pos1 : pos3)); + } + + /** + * Select the pivot value for the quicksort. 
+ * @return The position of the pivot value. + */ + private int selectPivot(final QuickSortRangeInfo qsri) + { + int pos1 = qsri.m_bucketStartPos; + int pos3 = pos1 + qsri.m_bucketLen - 1; + int pos2 = (pos1 + pos3) / 2; + + // For a large bucket, use a median of three median values + if (qsri.m_bucketLen > 500) + { + int d = qsri.m_bucketLen / 8; + pos1 = med3(pos1, pos1 + d, pos1 + 2 * d, qsri.m_depth); + pos2 = med3(pos2 - d, pos2, pos2 + d, qsri.m_depth); + pos3 = med3(pos3 - 2 * d, pos3 - d, pos3, qsri.m_depth); + } + return med3(pos1, pos2, pos3, qsri.m_depth); + } + + /** + * Swap the elements in the two positions in the array. + */ + private void swap(final int pos1, final int pos2) + { + int v1 = m_ptr[pos1]; + m_ptr[pos1] = m_ptr[pos2]; + m_ptr[pos2] = v1; + } + + /** + * Shell sort the data in the range. This is used for data ranges that are + * too short to be quicksorted. + *

+ * This method is declared package private for the unit tests. + */ + void shellSortRange(final QuickSortRangeInfo qsri) + { + // If the implementation of this method looks strange it is because it + // is heavily optimized. + + final int len = qsri.m_bucketLen; + final int depth = qsri.m_depth; + final int startPos = qsri.m_bucketStartPos; + final int endPos = startPos + len; + int incMax = 1; + while (SHELL_SORT_INCREMENTS[incMax] < len) + { + incMax++; + } + + for (int incrementPtr = incMax - 1; incrementPtr >= 0; incrementPtr--) + { + final int increment = SHELL_SORT_INCREMENTS[incrementPtr]; + final int startIter = startPos + increment; + for (int i = startIter; i < endPos; i++) + { + INCLOOP: for (int j = i; j >= startIter; j -= increment) + { + int curDepth = depth; + int curPos1 = m_ptr[j - increment] + depth - 1; + int curPos2 = m_ptr[j] + depth - 1; + + // Tests with sort cache lookups. + // Inner loop. + while (true) + { + while (curPos1 >= m_length) + { + curPos1 -= m_length; + } + while (curPos2 >= m_length) + { + curPos2 -= m_length; + } + + // Eight tests with sort cache lookups. The data + // overshoot helps us to avoid range checks when + // the pointers are incremented. 
+ if (getDataAt(++curPos1) == getDataAt(++curPos2)) + { + if (m_sortCache[curPos1] == m_sortCache[curPos2]) + { + // 2 + if (getDataAt(++curPos1) == getDataAt(++curPos2)) + { + if (m_sortCache[curPos1] == m_sortCache[curPos2]) + { + // 3 + if (getDataAt(++curPos1) == getDataAt(++curPos2)) + { + if (m_sortCache[curPos1] == m_sortCache[curPos2]) + { + // 4 + if (getDataAt(++curPos1) == getDataAt(++curPos2)) + { + if (m_sortCache[curPos1] == m_sortCache[curPos2]) + { + // 5 + if (getDataAt(++curPos1) == getDataAt(++curPos2)) + { + if (m_sortCache[curPos1] == m_sortCache[curPos2]) + { + // 6 + if (getDataAt(++curPos1) == getDataAt(++curPos2)) + { + if (m_sortCache[curPos1] == m_sortCache[curPos2]) + { + // 7 + if (getDataAt(++curPos1) == getDataAt(++curPos2)) + { + if (m_sortCache[curPos1] == m_sortCache[curPos2]) + { + // 8 + if (getDataAt(++curPos1) == getDataAt(++curPos2)) + { + if (m_sortCache[curPos1] == m_sortCache[curPos2]) + { + curDepth += 8; + if (curDepth >= m_length) + { + // The strings are exactly equal. This can happen for bzip2 when + // we have input such as AAA (only) that does not get run length + // encoded. + break INCLOOP; + } + + // The eight symbols were equals and no cache hits. 
Continue the inner loop + } + else + { + if (m_sortCache[curPos1] < m_sortCache[curPos2]) + { + break INCLOOP; + } + else + { + swap(j - increment, j); + continue INCLOOP; + } + } + } + else + { + if (getDataAt(curPos1) < getDataAt(curPos2)) + { + break INCLOOP; + } + else + { + swap(j - increment, j); + continue INCLOOP; + } + } + } + else + { + if (m_sortCache[curPos1] < m_sortCache[curPos2]) + { + break INCLOOP; + } + else + { + swap(j - increment, j); + continue INCLOOP; + } + } + } + else + { + if (getDataAt(curPos1) < getDataAt(curPos2)) + { + break INCLOOP; + } + else + { + swap(j - increment, j); + continue INCLOOP; + } + } + } + else + { + if (m_sortCache[curPos1] < m_sortCache[curPos2]) + { + break INCLOOP; + } + else + { + swap(j - increment, j); + continue INCLOOP; + } + } + } + else + { + if (getDataAt(curPos1) < getDataAt(curPos2)) + { + break INCLOOP; + } + else + { + swap(j - increment, j); + continue INCLOOP; + } + } + } + else + { + if (m_sortCache[curPos1] < m_sortCache[curPos2]) + { + break INCLOOP; + } + else + { + swap(j - increment, j); + continue INCLOOP; + } + } + } + else + { + if (getDataAt(curPos1) < getDataAt(curPos2)) + { + break INCLOOP; + } + else + { + swap(j - increment, j); + continue INCLOOP; + } + } + } + else + { + if (m_sortCache[curPos1] < m_sortCache[curPos2]) + { + break INCLOOP; + } + else + { + swap(j - increment, j); + continue INCLOOP; + } + } + } + else + { + if (getDataAt(curPos1) < getDataAt(curPos2)) + { + break INCLOOP; + } + else + { + swap(j - increment, j); + continue INCLOOP; + } + } + } + else + { + if (m_sortCache[curPos1] < m_sortCache[curPos2]) + { + break INCLOOP; + } + else + { + swap(j - increment, j); + continue INCLOOP; + } + } + } + else + { + if (getDataAt(curPos1) < getDataAt(curPos2)) + { + break INCLOOP; + } + else + { + swap(j - increment, j); + continue INCLOOP; + } + } + } + else + { + if (m_sortCache[curPos1] < m_sortCache[curPos2]) + { + break INCLOOP; + } + else + { + swap(j - increment, j); 
+ continue INCLOOP; + } + } + } + else + { + if (getDataAt(curPos1) < getDataAt(curPos2)) + { + break INCLOOP; + } + else + { + swap(j - increment, j); + continue INCLOOP; + } + } + } + else + { + if (m_sortCache[curPos1] < m_sortCache[curPos2]) + { + break INCLOOP; + } + else + { + swap(j - increment, j); + continue INCLOOP; + } + } + } + else + { + if (getDataAt(curPos1) < getDataAt(curPos2)) + { + break INCLOOP; + } + else + { + swap(j - increment, j); + continue INCLOOP; + } + } + } + } + } + } + } + + /** + * Get the index of the string that has the first differing value at the + * given depth compared to the first string in the range. + * @param bucketStartPos The start of the range. + * @param bucketLen The length of the range. + * @param depth The depth to investigate. + * @return The index of the first differing value, or {@code -1} if all + * values are equal at the given depth. + */ + private int getPositionOfFirstDifferingValue(final int bucketStartPos, final int bucketLen, final int depth) + { + assert depth <= DATA_OVERSHOOT; + + final int c0 = getDataAt(m_ptr[bucketStartPos] + depth); + final int upperBound = bucketStartPos + bucketLen; + for (int i = bucketStartPos + 1; i < upperBound; i++) + { + if (getDataAt(m_ptr[i] + depth) != c0) + { + return i; + } + } + // All values at this depth are equal + return -1; + } + + /** + * Swap the {@code len} values after {@code r1Start} with the {@code len} + * values after {@code r2start}. + * @param r1Start The start of the first range. + * @param r2Start The start of the second range. + * @param len The number of bytes to swap. + */ + private void swapRanges(final int r1Start, final int r2Start, final int len) + { + assert r1Start + len <= r2Start; + + // Is the scratchpad's temp area large enough? + if (m_scratchpad.m_tempArea.length < len) + { + // No. 
Reallocate it + m_scratchpad.m_tempArea = new int[len + 100]; + } + + System.arraycopy(m_ptr, r1Start, m_scratchpad.m_tempArea, 0, len); + System.arraycopy(m_ptr, r2Start, m_ptr, r1Start, len); + System.arraycopy(m_scratchpad.m_tempArea, 0, m_ptr, r2Start, len); + } + + /** + * Add the range to the stack containing ranges that are left to sort. + */ + private void addRangeToStack(final int bucketStartPos, final int bucketLen, final int depth) + { + if (bucketLen < 2) + { + // Already sorted + return; + } + else + { + m_sortStack[++m_sortStackPointer] = new QuickSortRangeInfo(bucketStartPos, bucketLen, depth); + } + } + + /** + * Quicksort the range. + *

    /**
     * Run one three-way quicksort pass over the range.
     * <p>
     * The range is partitioned on the data byte at the sort depth into
     * elements less than, equal to and greater than the pivot value. The three
     * resulting segments are not sorted by this call; they are pushed onto the
     * sort stack, the equal segment with the depth increased by one.
     * <p>
     * If all elements are equal at the starting depth, the depth is increased
     * until a difference is found. If the depth reaches
     * {@code QUICKSORT_DEPTH_THRESHOLD} first, the range is shell sorted
     * instead.
     * <p>
     * This method is declared package-private for the unit tests.
     * @param qsri The range to sort and the depth to start comparing at.
     */
    void quickSortRange(final QuickSortRangeInfo qsri)
    {
        // Select the pivot element.
        final int pivot = selectPivot(qsri);

        // Move the pivot into the first position
        swap(qsri.m_bucketStartPos, pivot);

        // First check if all characters are equal at the given depth, in which
        // case we increase the depth and try again
        int sortDepth = qsri.m_depth;

        // The sort depth threshold should be less than the overshoot. If it
        // were not, we would have to think of the boundaries of the m_data
        // array and such.
        assert sortDepth < DATA_OVERSHOOT;

        int posAtFirstDifferingValue = getPositionOfFirstDifferingValue(qsri.m_bucketStartPos, qsri.m_bucketLen, sortDepth);
        while (posAtFirstDifferingValue == -1)
        {
            // All characters at the current depth are equal. Sort using an
            // increased depth.

            if (sortDepth == m_length)
            {
                // The depth has reached the length of the data. All strings
                // are equal, so there is nothing to sort.
                return;
            }
            else
            {
                if (++sortDepth < QUICKSORT_DEPTH_THRESHOLD)
                {
                    posAtFirstDifferingValue = getPositionOfFirstDifferingValue(qsri.m_bucketStartPos, qsri.m_bucketLen, sortDepth);
                }
                else
                {
                    // Too deep for quicksort. Use shell sort instead. Note
                    // that the unmodified range info is passed on.
                    shellSortRange(qsri);
                    return;
                }
            }
        }

        // Sort using the calculated depth.

        // Iterate through the data to sort using two pointers advancing
        // from each end of the data range to sort.
        // Create one area at the start of the range and one at the end of
        // the range where we move values that are equal to the pivot value.
        // All elements before posAtFirstDifferingValue are equal to the pivot
        // (which is in the first position), so they already form the start of
        // the lower pivot range.
        int lowPtr = posAtFirstDifferingValue;
        // Pointer pointing to the element after the lower pivot range
        int lowPivotRangePtr = posAtFirstDifferingValue;
        int hiPtr = qsri.m_bucketStartPos + qsri.m_bucketLen - 1;
        // Pointer pointing to the element before the upper pivot range.
        int hiPivotRangePtr = hiPtr;
        int pivotVal = getDataAt(m_ptr[qsri.m_bucketStartPos] + sortDepth);
        while (true)
        {
            int curData;
            // Move the lower pointer forward
            while (lowPtr <= hiPtr && (curData = getDataAt(m_ptr[lowPtr] + sortDepth)) <= pivotVal)
            {
                if (curData == pivotVal)
                {
                    // Move the data into the lower pivot range and increase
                    // the pivot range pointer.
                    swap(lowPtr, lowPivotRangePtr++);
                }
                lowPtr++;
            }

            // Move the upper pointer backwards
            while (lowPtr <= hiPtr && (curData = getDataAt(m_ptr[hiPtr] + sortDepth)) >= pivotVal)
            {
                if (curData == pivotVal)
                {
                    // Move the data into the upper pivot range and decrease
                    // the pivot range pointer.
                    swap(hiPtr, hiPivotRangePtr--);
                }
                hiPtr--;
            }

            if (lowPtr > hiPtr)
            {
                // The pointers have crossed. We're done
                break;
            }

            // Now the value at lowPtr is larger than the pivot
            // value and the value at hiPtr is smaller. Swap the two
            // values and continue moving the pointers.
            swap(lowPtr++, hiPtr--);
        }

        // Merge and move the two pivot ranges to the center of the array
        // and sort the three resulting segments.

        // Swap the smallest possible ranges
        // lowRangeLen is the number of elements that are less than the pivot.
        final int lowRangeLen = lowPtr - lowPivotRangePtr;
        int rlen = Math.min(lowPivotRangePtr - qsri.m_bucketStartPos, lowRangeLen);
        if (rlen > 0)
        {
            swapRanges(qsri.m_bucketStartPos, lowPtr - rlen, rlen);
        }

        // hiRangeLen is the number of elements that are greater than the
        // pivot.
        final int hiRangeLen = hiPivotRangePtr - hiPtr;
        rlen = Math.min(qsri.m_bucketStartPos + qsri.m_bucketLen - hiPivotRangePtr - 1, hiRangeLen);
        if (rlen > 0)
        {
            swapRanges(lowPtr, qsri.m_bucketStartPos + qsri.m_bucketLen - rlen, rlen);
        }
        final int pivotRangeLen = qsri.m_bucketLen - lowRangeLen - hiRangeLen;

        // Sort the lower range
        addRangeToStack(qsri.m_bucketStartPos, lowRangeLen, sortDepth);
        // Sort the pivot range at an increased depth
        addRangeToStack(qsri.m_bucketStartPos + lowRangeLen, pivotRangeLen, sortDepth + 1);
        // Sort the higher range
        addRangeToStack(qsri.m_bucketStartPos + lowRangeLen + pivotRangeLen, hiRangeLen, sortDepth);
    }

+ * This method is declared package private for the unit tests. + * @param bucketStartPos The start position of the bucket. + * @param bucketLen The length of the bucket. + * @param depth The depth to start comparing strings at. (The strings are + * all equal at lower depths.) + */ + void sortBucket(final int bucketStartPos, final int bucketLen, final int depth) + { + if (bucketLen < 2) + { + // Already sorted + return; + } + + assert m_sortStackPointer == -1; + + // Use a stack with quick sort pass settings instead of recursing since + // the stack may become very large. + m_sortStack[++m_sortStackPointer] = new QuickSortRangeInfo(bucketStartPos, bucketLen, depth); + while (m_sortStackPointer >= 0) + { + QuickSortRangeInfo qsri = m_sortStack[m_sortStackPointer--]; + + // The minimum length of the segments to sort is 2. That is ensured + // by the addRangeToStack method. + + if ((qsri.m_bucketLen < m_minLengthForQuicksort) || (qsri.m_depth > QUICKSORT_DEPTH_THRESHOLD)) + { + shellSortRange(qsri); + } + else + { + // This adds up to three new sort ranges to the stack + // (values less than, equal to and higher than the pivot value) + quickSortRange(qsri); + } + } + } + + /** + * Calculate the sort order for all big buckets. (256 of them in all, each + * containing 256 small buckets.) + *

/**
 * Calculate the sort order for all big buckets. (256 of them in all, each
 * containing 256 small buckets.)
 * <p>
 * Smaller buckets are sorted before larger. This is a more efficient way of
 * filling the sort cache.
 * <p>
 * The returned array is the shared {@code m_scratchpad.m_sortOrder} array; it
 * is overwritten by every call.
 * @param bucketStartPositions The start positions of the small buckets. The
 * entry for small bucket {@code s} of large bucket {@code b} is read at index
 * {@code b * 256 + s}.
 * @return An array containing the indices of the large buckets in the order
 * that they should be sorted.
 */
private int[] establishSortOrder(final int[] bucketStartPositions)
{
    final int[] sortOrder = m_scratchpad.m_sortOrder;
    // Start from the identity order 0, 1, ..., 255.
    for (int i = 0; i < 256; i++)
    {
        sortOrder[i] = i;
    }

    // Shell sort the sort orders
    // incPtr == 4 gives an increment of 121
    for (int incPtr = 4; incPtr >= 0; incPtr--)
    {
        final int increment = SHELL_SORT_INCREMENTS[incPtr];
        for (int i = increment; i < sortOrder.length; i++)
        {
            INCLOOP: for (int j = i; j >= increment; j -= increment)
            {
                // Which of the lengths of the big buckets is the longest
                // The length used here is (start of small bucket 255) -
                // (start of small bucket 0) of each large bucket.
                // NOTE(review): this measure leaves out the length of small
                // bucket 255 itself -- confirm that this is intended.
                final int so1 = sortOrder[j - increment];
                final int so2 = sortOrder[j];
                if ((bucketStartPositions[so1 * 256 + 255] - bucketStartPositions[so1 * 256]) > (bucketStartPositions[so2 * 256 + 255] - bucketStartPositions[so2 * 256]))
                {
                    // The earlier entry is the longer one: swap the two.
                    sortOrder[j] = so1;
                    sortOrder[j - increment] = so2;
                }
                else
                {
                    // This sort order element is in its right position.
                    break INCLOOP;
                }
            }
        }
    }

    return sortOrder;
}

/**
 * Sort the data. This method borrows optimizations from bzip2 1.0.5.
 * <p>
 * The data is first split into 256 * 256 small buckets by
 * {@code radixSort()}. The large buckets are then processed in the order
 * given by {@link #establishSortOrder(int[])}: the not yet sorted small
 * buckets of a large bucket are sorted with {@code sortBucket}, the order of
 * one small bucket in every other large bucket is derived from that result by
 * two scanning passes, and finally the sort cache is updated for the large
 * bucket.
 * @return The {@code m_ptr} array, or a new empty array if there is no data.
 * The original documentation describes it as an array with pointers from each
 * byte's original position to its position in the sorted data.
 * NOTE(review): the code below stores data positions as the <i>values</i> of
 * {@code m_ptr} ({@code m_ptr[index] = k}, where {@code k} is also passed to
 * {@code getDataAt}), so the mapping may run in the opposite direction --
 * confirm against the callers.
 */
int[] sort()
{
    if (m_length == 0)
    {
        return new int[0];
    }

    // Run a least significant digit radix sort on all two-byte permutations
    // of the incoming data. This gives 256^2 buckets with similar data
    // which can then be sorted individually.

    // This method call also creates and populates the m_ptr array.
    // The bucketStartPositions has an overshoot of one position, which
    // gives it the length 65537. The overshoot element should be equal to
    // the length of the data.
    final int[] bucketStartPositions = radixSort();
    // Fix the overshoot
    bucketStartPositions[65536] = m_length;

    // The bookkeeping arrays are taken from the scratchpad object; the two
    // "sorted" flag arrays are reset before use.
    final boolean[] sortedLargeBuckets = m_scratchpad.m_sortedLargeBuckets;
    Arrays.fill(sortedLargeBuckets, false);
    final boolean[] sortedSmallBuckets = m_scratchpad.m_sortedSmallBuckets;
    Arrays.fill(sortedSmallBuckets, false);
    final int[] copyStart = m_scratchpad.m_copyStart;
    final int[] copyEnd = m_scratchpad.m_copyEnd;

    // Establish a sort order for all big buckets (256 of them in all) with
    // the shortest buckets coming first. This will make the sort result
    // caching optimization most efficient
    final int[] sortOrder = establishSortOrder(bucketStartPositions);

    // Quick sort the elements in each non-empty bucket.
    for (int largeBucketIndex = 0; largeBucketIndex < 256; largeBucketIndex++)
    {
        final int largeBucketNo = sortOrder[largeBucketIndex];
        for (int smallBucketNo = 0; smallBucketNo < 256; smallBucketNo++)
        {
            // Don't sort when smallBucketNo == largeBucketNo. This small
            // bucket will be dealt with by the scanning step below.
            if (smallBucketNo != largeBucketNo)
            {
                final int bucketIndex = largeBucketNo * 256 + smallBucketNo;
                if (!sortedSmallBuckets[bucketIndex])
                {
                    final int bucketStartPos = bucketStartPositions[bucketIndex];
                    final int bucketLen = bucketStartPositions[bucketIndex + 1] - bucketStartPos;

                    if (bucketLen > 1)
                    {
                        // More than one data element in this bucket. Sort it.
                        // Sorting starts at depth 2.
                        sortBucket(bucketStartPos, bucketLen, 2);
                    }
                    sortedSmallBuckets[bucketIndex] = true;
                }
            }
        }

        // Now that we have sorted all small buckets in the large bucket n,
        // we can infer the sorted order for the small bucket n in all
        // large buckets m, including (magically) the small bucket n in the
        // large bucket n that we did not sort above.
        // copyStart[m] and copyEnd[m] are the first and the last position of
        // small bucket n inside large bucket m; they are moved towards each
        // other by the two scanning passes below.
        for (int m = 0; m < 256; m++)
        {
            copyStart[m] = bucketStartPositions[m * 256 + largeBucketNo];
            copyEnd[m] = bucketStartPositions[m * 256 + largeBucketNo + 1] - 1;
        }

        // Forward pass: from the start of large bucket n up to (but not
        // including) the current fill position of its own small bucket n.
        for (int i = bucketStartPositions[largeBucketNo * 256]; i < copyStart[largeBucketNo]; i++)
        {
            // k is the data position just before m_ptr[i], wrapping around
            // to the end of the data.
            int k = m_ptr[i] - 1;
            if (k < 0)
            {
                k += m_length;
            }
            // m is used as a large bucket number below (presumably the byte
            // value at position k).
            final int m = getDataAt(k);
            if (!sortedLargeBuckets[m])
            {
                int index = copyStart[m]++;
                if (index >= m_length)
                {
                    index -= m_length;
                }
                m_ptr[index] = k;
            }
        }

        // Backward pass: from the end of large bucket n down to (but not
        // including) the current fill position at the end of its own small
        // bucket n.
        for (int i = bucketStartPositions[(largeBucketNo + 1) * 256] - 1; i > copyEnd[largeBucketNo]; i--)
        {
            int k = m_ptr[i] - 1;
            if (k < 0)
            {
                k += m_length;
            }
            final int m = getDataAt(k);
            if (!sortedLargeBuckets[m])
            {
                int index = copyEnd[m]--;
                if (index < 0)
                {
                    index += m_length;
                }
                m_ptr[index] = k;
            }
        }

        // Mark all buckets that we got for free as sorted
        for (int m = 0; m < 256; m++)
        {
            sortedSmallBuckets[m * 256 + largeBucketNo] = true;
        }

        sortedLargeBuckets[largeBucketNo] = true;

        // Fix the sort cache for the large bucket.
        // Don't do it for the last sorted bucket.
        if (largeBucketIndex != 255)
        {
            final int largeBucketStart = bucketStartPositions[largeBucketNo * 256];
            final int largeBucketEnd;
            if (largeBucketNo < 255)
            {
                largeBucketEnd = bucketStartPositions[(largeBucketNo + 1) * 256];
            }
            else
            {
                // Same value as bucketStartPositions[65536], which was set
                // to m_length above.
                largeBucketEnd = m_length;
            }
            final int largeBucketSize = largeBucketEnd - largeBucketStart;
            assert largeBucketSize >= 0;

            // Find the smallest shift that scales every rank within the
            // large bucket down to at most 65534.
            int shifts = 0;
            while (largeBucketSize >>> shifts > 65534)
            {
                shifts++;
            }

            // Store the scaled rank of every element of the large bucket in
            // the sort cache, indexed by the element's m_ptr value.
            for (int i = largeBucketSize - 1; i >= 0; i--)
            {
                final int sptr = m_ptr[largeBucketStart + i];
                final int qval = i >>> shifts;
                m_sortCache[sptr] = qval;
                if (sptr < DATA_OVERSHOOT)
                {
                    // Update cache in overshoot too
                    m_sortCache[m_length + sptr] = qval;
                }
            }
        }
    }
    return m_ptr;
}
The + * {@link org.at4j.comp.bzip2.BZip2ReadableFile} and + * {@link org.at4j.comp.bzip2.BZip2WritableFile} can be used to transparently + * decompress and compress data in files. + * @since 1.0 + * @author Karl Gustafsson + */ +package org.at4j.comp.bzip2; \ No newline at end of file diff --git a/src/main/java/org/at4j/comp/package-info.java b/src/main/java/org/at4j/comp/package-info.java new file mode 100644 index 0000000..11ef1d3 --- /dev/null +++ b/src/main/java/org/at4j/comp/package-info.java @@ -0,0 +1,24 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +/** + * The {@link org.at4j.comp.CompressionLevel} enum.
+ * @since 1.0 + * @author Karl Gustafsson + */ +package org.at4j.comp; \ No newline at end of file diff --git a/src/main/java/org/at4j/support/comp/ByteMoveToFront.java b/src/main/java/org/at4j/support/comp/ByteMoveToFront.java new file mode 100644 index 0000000..85dc55f --- /dev/null +++ b/src/main/java/org/at4j/support/comp/ByteMoveToFront.java @@ -0,0 +1,173 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package org.at4j.support.comp; + +/** + * A move-to-front (MTF) encoder and decoder for bytes. For more information on + * MTF encoding, see the + * Wikipedia article on move-to-front transforms. + *

/**
 * A move-to-front (MTF) encoder and decoder for bytes. For more information on
 * MTF encoding, see <a
 * href="http://en.wikipedia.org/wiki/Move-to-front_transform">the Wikipedia
 * article on move-to-front transforms</a>.
 * <p>
 * Encoded values are indices into the current alphabet. An index is stored in
 * a {@code byte} and is always interpreted as an unsigned value (0 - 255).
 * <p>
 * This object is not thread safe. Clients must provide external synchronization
 * if they are to use it from several concurrent threads.
 * @author Karl Gustafsson
 * @since 1.1
 * @see IntMoveToFront
 */
public class ByteMoveToFront
{
    private final byte[] m_alphabet;

    /**
     * Build the alphabet {@code minVal, minVal + 1, ..., maxVal}.
     * @param minVal The first value, 0 to 254.
     * @param maxVal The last value, 1 to 255. Must be greater than
     * {@code minVal}.
     * @return A new array holding the alphabet.
     * @throws IndexOutOfBoundsException If the range is invalid.
     */
    private static byte[] createByteAlphabetFromRange(int minVal, int maxVal) throws IndexOutOfBoundsException
    {
        if ((minVal < 0) || (maxVal > 255) || (minVal >= maxVal))
        {
            throw new IndexOutOfBoundsException("Invalid min and/or max value: min " + minVal + ", max " + maxVal);
        }
        int alphLen = maxVal - minVal + 1;
        byte[] alphabet = new byte[alphLen];
        for (int i = 0; i < alphLen; i++)
        {
            alphabet[i] = (byte) ((i + minVal) & 0xFF);
        }
        return alphabet;
    }

    /**
     * Create a byte MTF encoder/decoder that transforms bytes in the range
     * between {@code minValue} and {@code maxValue}.
     * <p>
     * The initial alphabet of the transformer will be {@code minValue …
     * maxValue}.
     * @param minValue The start value of the range. This should be an unsigned
     * byte in the range 0 to 254.
     * @param maxValue The end value of the range. This should be an unsigned
     * byte in the range 1 to 255.
     * @throws IndexOutOfBoundsException If the min and/or the max values are
     * not unsigned bytes or if the min value is equal to or greater than the
     * max value.
     */
    public ByteMoveToFront(int minValue, int maxValue) throws IndexOutOfBoundsException
    {
        this(createByteAlphabetFromRange(minValue, maxValue));
    }

    /**
     * Create a byte MTF encoder/decoder that transforms bytes using the
     * supplied initial alphabet.
     * @param alphabet The initial alphabet. This byte array is <i>not</i>
     * copied by this method and it will be modified by encoding or decoding
     * operations.
     * @throws NullPointerException If {@code alphabet} is {@code null}.
     */
    public ByteMoveToFront(byte[] alphabet)
    {
        if (alphabet == null)
        {
            throw new NullPointerException("alphabet");
        }

        m_alphabet = alphabet;
    }

    /**
     * Find the position of a byte in the current alphabet.
     * @param val The byte to look for.
     * @return The index of the first occurrence of {@code val}.
     * @throws ArrayIndexOutOfBoundsException If {@code val} is not in the
     * alphabet.
     */
    private int indexOf(byte val) throws ArrayIndexOutOfBoundsException
    {
        for (int j = 0; j < m_alphabet.length; j++)
        {
            if (m_alphabet[j] == val)
            {
                return j;
            }
        }
        throw new ArrayIndexOutOfBoundsException("Byte " + (val & 0xFF) + " is not in the MTF alphabet");
    }

    /**
     * Move the alphabet entry at {@code index} to position 0, shifting the
     * entries before it one step towards the end.
     * @param index The index of the entry to move.
     * @return The moved byte.
     */
    private byte moveToFront(int index)
    {
        final byte val = m_alphabet[index];
        // arraycopy handles the overlapping source and destination ranges.
        System.arraycopy(m_alphabet, 0, m_alphabet, 1, index);
        m_alphabet[0] = val;
        return val;
    }

    /**
     * Encode the bytes in {@code in} and store them in the array {@code out}.
     * The MTF alphabet is also updated by this method.
     * <p>
     * Every encoded byte is the unsigned index (0 - 255) of the input byte in
     * the alphabet at the time it was encoded.
     * <p>
     * If a byte is not in the alphabet, the alphabet is left as it was after
     * the last successfully encoded byte.
     * @param in The bytes to encode.
     * @param out The array to store the encoded bytes in. This array must be at
     * least as long as {@code in}.
     * @return {@code out}
     * @throws ArrayIndexOutOfBoundsException If any of the bytes in {@code in}
     * are not in the MTF alphabet.
     * @throws IllegalArgumentException If the {@code out} array is too short.
     */
    public byte[] encode(byte[] in, byte[] out) throws ArrayIndexOutOfBoundsException, IllegalArgumentException
    {
        if (out.length < in.length)
        {
            throw new IllegalArgumentException("The output array must be at least of the same length as the input array. Was in: " + in.length + ", out: " + out.length);
        }

        for (int i = 0; i < in.length; i++)
        {
            // Look the byte up before touching the alphabet so that a
            // missing byte cannot leave the alphabet half shifted.
            final int index = indexOf(in[i]);
            out[i] = (byte) index;
            moveToFront(index);
        }
        return out;
    }

    /**
     * Decode a single byte and update the MTF alphabet.
     * @param index The index in the MTF alphabet for the byte.
     * @return The byte.
     * @throws ArrayIndexOutOfBoundsException If {@code index} is not a valid
     * index in the alphabet.
     */
    public byte decode(int index)
    {
        return moveToFront(index);
    }

    /**
     * Decode an array of bytes and update the MTF alphabet. The decoded bytes
     * are stored in {@code out}.
     * @param in The bytes to decode. Every byte is treated as an unsigned
     * index (0 - 255) in the alphabet.
     * @param out The array to store the decoded bytes in. This array must be at
     * least as long as {@code in}.
     * @return {@code out}
     * @throws ArrayIndexOutOfBoundsException If any of the indices in
     * {@code in} are outside of the MTF alphabet.
     * @throws IllegalArgumentException If {@code out} is too short.
     */
    public byte[] decode(byte[] in, byte[] out) throws ArrayIndexOutOfBoundsException, IllegalArgumentException
    {
        if (out.length < in.length)
        {
            throw new IllegalArgumentException("The output array must be at least of the same length as the input array. Was in: " + in.length + ", out: " + out.length);
        }

        for (int i = 0; i < in.length; i++)
        {
            // Mask to undo the sign extension of the byte: indices 128 - 255
            // would otherwise become negative array indices.
            out[i] = decode(in[i] & 0xFF);
        }
        return out;
    }
}

/**
 * A move-to-front (MTF) encoder and decoder for integers. For more information
 * on MTF encoding, see <a
 * href="http://en.wikipedia.org/wiki/Move-to-front_transform">the Wikipedia
 * article on move-to-front transforms</a>.
 * <p>
 * This object is not thread safe. Clients must provide external synchronization
 * if they are to use it from several concurrent threads.
 * @author Karl Gustafsson
 * @since 1.1
 * @see ByteMoveToFront
 */
public class IntMoveToFront
{
    private final int[] m_alphabet;

    /**
     * Build the alphabet {@code minVal, minVal + 1, ..., maxVal}.
     * @param minVal The first value.
     * @param maxVal The last value. Must be greater than {@code minVal}.
     * @return A new array holding the alphabet.
     * @throws IndexOutOfBoundsException If {@code minVal >= maxVal}.
     */
    private static int[] createIntAlphabetFromRange(int minVal, int maxVal) throws IndexOutOfBoundsException
    {
        if (minVal >= maxVal)
        {
            throw new IndexOutOfBoundsException("Invalid min and max values. Min=" + minVal + ", max=" + maxVal);
        }
        int alphLen = maxVal - minVal + 1;
        int[] alphabet = new int[alphLen];
        for (int i = 0; i < alphLen; i++)
        {
            alphabet[i] = i + minVal;
        }
        return alphabet;
    }

    /**
     * Create an integer MTF encoder/decoder that transforms integers in the
     * range between {@code minValue} and {@code maxValue}.
     * <p>
     * The initial alphabet of the transformer will be {@code minValue …
     * maxValue}.
     * @param minValue The start value of the range.
     * @param maxValue The end value of the range.
     * @throws IndexOutOfBoundsException If the min value is equal to or greater
     * than the max value.
     */
    public IntMoveToFront(int minValue, int maxValue) throws IndexOutOfBoundsException
    {
        this(createIntAlphabetFromRange(minValue, maxValue));
    }

    /**
     * Create an integer MTF encoder/decoder that transforms integers using the
     * supplied initial alphabet.
     * @param alphabet The initial alphabet. This integer array is <i>not</i>
     * copied by this method and it will be modified by encoding or decoding
     * operations.
     * @throws NullPointerException If {@code alphabet} is {@code null}.
     */
    public IntMoveToFront(int[] alphabet)
    {
        if (alphabet == null)
        {
            throw new NullPointerException("alphabet");
        }

        m_alphabet = alphabet;
    }

    /**
     * Find the position of an integer in the current alphabet.
     * @param val The integer to look for.
     * @return The index of the first occurrence of {@code val}.
     * @throws ArrayIndexOutOfBoundsException If {@code val} is not in the
     * alphabet.
     */
    private int indexOf(int val) throws ArrayIndexOutOfBoundsException
    {
        for (int j = 0; j < m_alphabet.length; j++)
        {
            if (m_alphabet[j] == val)
            {
                return j;
            }
        }
        throw new ArrayIndexOutOfBoundsException("Value " + val + " is not in the MTF alphabet");
    }

    /**
     * Move the alphabet entry at {@code index} to position 0, shifting the
     * entries before it one step towards the end.
     * @param index The index of the entry to move.
     * @return The moved integer.
     */
    private int moveToFront(int index)
    {
        final int val = m_alphabet[index];
        // arraycopy handles the overlapping source and destination ranges.
        System.arraycopy(m_alphabet, 0, m_alphabet, 1, index);
        m_alphabet[0] = val;
        return val;
    }

    /**
     * Encode the integers in {@code in} and store them in the array
     * {@code out}. The MTF alphabet is also updated by this method.
     * <p>
     * Every encoded integer is the index of the input integer in the alphabet
     * at the time it was encoded. The full index is stored; it is not
     * truncated to a byte.
     * <p>
     * If an integer is not in the alphabet, the alphabet is left as it was
     * after the last successfully encoded integer.
     * @param in The integers to encode.
     * @param out The array to store the encoded integers in. This array must be
     * at least as long as {@code in}.
     * @return {@code out}
     * @throws ArrayIndexOutOfBoundsException If any of the integers in
     * {@code in} are not in the MTF alphabet.
     * @throws IllegalArgumentException If the {@code out} array is too short.
     */
    public int[] encode(int[] in, int[] out) throws ArrayIndexOutOfBoundsException, IllegalArgumentException
    {
        if (out.length < in.length)
        {
            throw new IllegalArgumentException("The output array must be at least of the same length as the input array. Was in: " + in.length + ", out: " + out.length);
        }

        for (int i = 0; i < in.length; i++)
        {
            // Look the value up before touching the alphabet so that a
            // missing value cannot leave the alphabet half shifted.
            final int index = indexOf(in[i]);
            out[i] = index;
            moveToFront(index);
        }
        return out;
    }

    /**
     * Decode a single integer and update the MTF alphabet.
     * @param index The index in the MTF alphabet for the integer.
     * @return The integer.
     * @throws ArrayIndexOutOfBoundsException If {@code index} is not a valid
     * index in the alphabet.
     */
    public int decode(int index)
    {
        return moveToFront(index);
    }

    /**
     * Decode an array of integers and update the MTF alphabet. The decoded
     * integers are stored in {@code out}.
     * @param in The indices to decode.
     * @param out The array to store the decoded integers in. This array must be
     * at least as long as {@code in}.
     * @return {@code out}
     * @throws ArrayIndexOutOfBoundsException If any of the indices in
     * {@code in} are outside of the MTF alphabet.
     * @throws IllegalArgumentException If {@code out} is too short.
     */
    public int[] decode(int[] in, int[] out) throws ArrayIndexOutOfBoundsException, IllegalArgumentException
    {
        if (out.length < in.length)
        {
            throw new IllegalArgumentException("The output array must be at least of the same length as the input array. Was in: " + in.length + ", out: " + out.length);
        }

        for (int i = 0; i < in.length; i++)
        {
            out[i] = decode(in[i]);
        }
        return out;
    }
}
+ * @since 1.1 + * @author Karl Gustafsson + */ +package org.at4j.support.comp; \ No newline at end of file diff --git a/src/main/java/org/at4j/support/io/BitInput.java b/src/main/java/org/at4j/support/io/BitInput.java new file mode 100644 index 0000000..4bffd38 --- /dev/null +++ b/src/main/java/org/at4j/support/io/BitInput.java @@ -0,0 +1,180 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package org.at4j.support.io; + +import java.io.Closeable; +import java.io.IOException; + +/** + * This interface identifies a source for bits. + *

+ * The source is assumed to have a position which may or may not be at a byte + * boundary (every eight bits). + *

/**
 * This interface identifies a source for bits.
 * <p>
 * The source is assumed to have a position which may or may not be at a byte
 * boundary (every eight bits).
 * <p>
 * If an implementing class also extends {@link java.io.InputStream} it can be
 * used as an input stream. This interface redefines
 * {@link java.io.InputStream}'s read methods with the extra condition that
 * they may only be used if the current position of the source is at a byte
 * boundary. The {@link #readBytes(byte[], int, int)} method does not have that
 * limitation.
 * @author Karl Gustafsson
 * @since 1.1
 * @see java.io.InputStream
 * @see BitOutput
 */
public interface BitInput extends Closeable
{
    /**
     * Has the input come to its end? If so, nothing more can be read from it.
     * @return {@code true} if no more can be read from this input.
     */
    boolean isAtEof();

    /**
     * Move the position to the next byte boundary. If the current position is
     * already at a byte boundary, this method does nothing.
     * @throws IOException On I/O errors or if this input is already at the end
     * of the available data.
     */
    void skipToByteBoundary() throws IOException;

    /**
     * Read the value of the next bit in the stream.
     * @return {@code true} if the value is 1, {@code false} if it is 0.
     * @throws IOException On I/O errors or if this input is already at the end
     * of the available data.
     */
    boolean readBit() throws IOException;

    /**
     * Read up to eight bits from the input.
     * @param no The number of bits to read.
     * @return The bits as the least significant bits of the returned integer.
     * For instance, if {@code 1011} is read, the returned integer will have the
     * value {@code 1 * 8 + 0 * 4 + 1 * 2 + 1 * 1 == 11}.
     * @throws IndexOutOfBoundsException If {@code no} is less than 0 or greater
     * than 8.
     * @throws IOException On I/O errors or if this input is already at the end
     * of the available data.
     * @see #readBitsLittleEndian(int)
     */
    int readBits(int no) throws IndexOutOfBoundsException, IOException;

    /**
     * Read up to 32 bits from the input. The first eight bits that are read
     * will be the most significant byte of the returned integer.
     * @param no The number of bits to read.
     * @return The bits read as the least significant bits of the returned
     * integer. (Just like for {@link #readBits(int)}.)
     * @throws IndexOutOfBoundsException If {@code no} is less than 0 or greater
     * than 32.
     * @throws IOException On I/O errors or if this input is already at the end
     * of the available data.
     * @see #readBits(int)
     */
    int readBitsLittleEndian(int no) throws IndexOutOfBoundsException, IOException;

    /**
     * Read bytes from the input. Unlike {@link #read(byte[], int, int)}, this
     * method does not require that the current position is at a byte boundary.
     * <p>
     * Another difference to {@link #read(byte[], int, int)} is that this method
     * throws an {@link IOException} if it cannot read all requested bytes.
     * @param barr The byte array to read bytes into.
     * @param off The offset in the array to start writing read bytes at.
     * @param len The number of bytes to read.
     * @return {@code barr}.
     * @throws IndexOutOfBoundsException If the length or the offset is negative
     * or if the sum of the length and the offset is greater than the length of
     * the supplied byte array.
     * @throws IOException On I/O errors or if there were not enough bytes to
     * read from the input.
     * @see #read(byte[], int, int)
     */
    public byte[] readBytes(byte[] barr, int off, int len) throws IndexOutOfBoundsException, IOException;

    /**
     * Read a single byte from the input. See
     * {@link java.io.InputStream#read()}.
     * <p>
     * This method requires that the current position in the input is at a byte
     * boundary.
     * @return The read byte or {@code -1} if the current position is at the end
     * of the input.
     * @throws IOException On I/O errors or if the current position is not at a
     * byte boundary.
     * @see java.io.InputStream#read()
     */
    int read() throws IOException;

    /**
     * Read bytes into the supplied array. See
     * {@link java.io.InputStream#read(byte[])}.
     * <p>
     * This method requires that the current position in the input is at a byte
     * boundary.
     * @param barr The byte array to read bytes into.
     * @return The number of bytes read.
     * @throws IOException On I/O errors or if the current position is not at a
     * byte boundary.
     * @see java.io.InputStream#read(byte[])
     */
    int read(byte[] barr) throws IOException;

    /**
     * Read bytes into the supplied array. See
     * {@link java.io.InputStream#read(byte[], int, int)}.
     * <p>
     * This method requires that the current position in the input is at a byte
     * boundary.
     * @param barr The byte array to read bytes into.
     * @param offset The offset position in the array to start writing read
     * bytes to.
     * @param len The number of bytes to read.
     * @return The number of bytes actually read.
     * @throws IndexOutOfBoundsException If the offset or the length is negative
     * or if the sum of the offset and the length is greater than the length of
     * the supplied byte array.
     * @throws IOException On I/O errors or if the current position is not at a
     * byte boundary.
     * @see java.io.InputStream#read(byte[], int, int)
     */
    int read(byte[] barr, int offset, int len) throws IndexOutOfBoundsException, IOException;

    /**
     * Skip bytes in the input. See {@link java.io.InputStream#skip(long)}.
     * <p>
     * This method requires that the current position in the input is at a byte
     * boundary.
     * @param n The number of bytes to skip.
     * @return The number of bytes skipped.
     * @throws IOException On I/O errors or if the current position is not at a
     * byte boundary.
     * @see java.io.InputStream#skip(long)
     */
    long skip(long n) throws IOException;

    /**
     * Get the number of bytes available in the input. See
     * {@link java.io.InputStream#available()}.
     * <p>
     * This method requires that the current position in the input is at a byte
     * boundary.
     * @return The number of bytes available in the input, as described for
     * {@link java.io.InputStream#available()}.
     * @throws IOException On I/O errors or if the current position is not at a
     * byte boundary.
     * @see java.io.InputStream#available()
     */
    int available() throws IOException;
}

+ * The sink is assumed to have a position which may or may not be at a byte + * boundary (every eight bits). + *

/**
 * This interface identifies a sink for bits.
 * <p>
 * The sink is assumed to have a position which may or may not be at a byte
 * boundary (every eight bits).
 * <p>
 * If an implementing class also extends {@link java.io.OutputStream} it can be
 * used as an output stream. This interface redefines
 * {@link java.io.OutputStream}'s write methods with the extra condition that
 * they may only be used if the current position of the sink is at a byte
 * boundary. The {@link #writeBytes(byte[], int, int)} method does not have that
 * limitation.
 * @author Karl Gustafsson
 * @since 1.1
 * @see java.io.OutputStream
 * @see BitInput
 */
public interface BitOutput extends Closeable
{
    /**
     * Pad the output with zeroes to the next byte boundary. If the current
     * position is already at a byte boundary, this method does nothing.
     * @throws IOException On I/O errors.
     */
    void padToByteBoundary() throws IOException;

    /**
     * Get the value of the unfinished byte. The value is shifted so that the
     * least significant bit positions are used.
     * {@link #getNumberOfBitsInUnfinishedByte()} returns how many bit positions
     * are used.
     * <p>
     * If the current position is at a byte boundary, 0 is returned.
     * @return The value of the unfinished byte.
     */
    int getUnfinishedByte();

    /**
     * Get the number of bits that have been written to the last byte.
     * <p>
     * If the current position is at a byte boundary, 0 is returned.
     * @return The number of bits that have been written to the last byte. This
     * is a number between 0 and 7 (inclusive).
     */
    int getNumberOfBitsInUnfinishedByte();

    /**
     * Write a single bit.
     * @param val The bit ({@code true == 1}, {@code false == 0}).
     * @throws IOException On I/O errors.
     */
    void writeBit(boolean val) throws IOException;

    /**
     * Write up to eight bits.
     * @param val The value to write. The bits written are the {@code no}
     * rightmost bits of {@code val}. It is not verified that {@code val} fits
     * within its {@code no} rightmost bits. If it does not, the written value
     * is simply truncated.
     * @param no The number of bits to write. This must be between 0 and 8
     * (inclusive).
     * @throws IndexOutOfBoundsException If {@code no} is less than 0 or greater
     * than 8.
     * @throws IOException On I/O errors.
     * @see #writeBitsLittleEndian(int, int)
     */
    void writeBits(int val, int no) throws IndexOutOfBoundsException, IOException;

    /**
     * Write up to 32 bits. The bits are written little endian with the most
     * significant bit first.
     * @param val The value to write. The bits written are the {@code no}
     * rightmost bits of {@code val}. It is not verified that {@code val} fits
     * within its {@code no} rightmost bits. If it does not, the written value
     * is simply truncated.
     * @param no The number of bits to write. This must be between 0 and 32
     * (inclusive).
     * @throws IndexOutOfBoundsException If {@code no} is less than 0 or more
     * than 32.
     * @throws IOException On I/O errors.
     * @see #writeBits(int, int)
     */
    void writeBitsLittleEndian(int val, int no) throws IndexOutOfBoundsException, IOException;

    /**
     * Write an array of bytes to the output. Unlike
     * {@link #write(byte[], int, int)}, this method does not require that the
     * current position is at a byte boundary.
     * @param barr The bytes to write.
     * @param off The offset in the byte array.
     * @param len The number of bytes to write.
     * @throws IndexOutOfBoundsException If the offset or the length is negative
     * or if the offset + length is larger than the byte array.
     * @throws IOException On I/O errors.
     * @see #write(byte[], int, int)
     */
    void writeBytes(byte[] barr, int off, int len) throws IndexOutOfBoundsException, IOException;

    /**
     * Write a single byte. See {@link java.io.OutputStream#write(int)}.
     * <p>
     * This method requires that the current position of the output is at a byte
     * boundary.
     * @param b The byte to write (0 - 255).
     * @throws IOException On I/O errors or if the current position is not at a
     * byte boundary.
     * @see java.io.OutputStream#write(int)
     */
    void write(int b) throws IOException;

    /**
     * Write an array of bytes. See {@link java.io.OutputStream#write(byte[])}.
     * <p>
     * This method requires that the current position of the output is at a byte
     * boundary.
     * @param barr The bytes to write.
     * @throws IOException On I/O errors or if the current position is not at a
     * byte boundary.
     * @see java.io.OutputStream#write(byte[])
     */
    void write(byte[] barr) throws IOException;

    /**
     * Write a part of an array of bytes. See
     * {@link java.io.OutputStream#write(byte[], int, int)}.
     * <p>
     * This method requires that the current position of the output is at a byte
     * boundary.
     * @param barr The bytes to write.
     * @param off The offset in the byte array.
     * @param len The number of bytes to write.
     * @throws IndexOutOfBoundsException If the offset or the length is negative
     * or if the offset + length is larger than the byte array.
     * @throws IOException On I/O errors or if the current position is not at a
     * byte boundary.
     * @see java.io.OutputStream#write(byte[], int, int)
     * @see #writeBytes(byte[], int, int)
     */
    void write(byte[] barr, int off, int len) throws IndexOutOfBoundsException, IOException;
}
+ * @author Karl Gustafsson + * @since 1.1 + */ +public class LittleEndianBitInputStream extends InputStream implements BitInput +{ + // 2^0 + private static final int POINTER_START_OF_BYTE = 0; + // 2^7 + private static final int POINTER_END_OF_BYTE = 7; + + private final InputStream m_in; + + // The current byte + private int m_curByte; + // The pointer to the current bit location in the current byte. + private int m_pointerInByte = POINTER_START_OF_BYTE; + + private long m_numberOfBytesRead = 0; + + public LittleEndianBitInputStream(InputStream in) throws IOException + { + // Null check + in.getClass(); + + m_in = in; + m_curByte = in.read(); + // Don't increment the number of read bytes counter. It is always one + // byte behind. + } + + private int readByte() throws IOException + { + int res = m_in.read(); + m_numberOfBytesRead += res != -1 ? 1 : 0; + return res; + } + + private void incrementPointerPosition() throws IOException + { + if (m_pointerInByte == POINTER_END_OF_BYTE) + { + // Read a new byte + m_curByte = readByte(); + m_pointerInByte = POINTER_START_OF_BYTE; + } + else + { + // Increment the pointer only if we're not at EOF + if (!isAtEof()) + { + m_pointerInByte++; + } + } + } + + public boolean isAtEof() + { + return m_curByte == -1; + } + + /** + * Get the number of whole bytes read this far. + * @return The number of bytes read this far. + */ + public long getNumberOfBytesRead() + { + return m_numberOfBytesRead; + } + + private void assertNotAtEOF() throws IOException + { + if (isAtEof()) + { + throwIOException("At EOF"); + } + } + + private boolean isAtByteBoundary() + { + return m_pointerInByte == POINTER_START_OF_BYTE; + } + + private void assertAtByteBoundary() throws IOException + { + if (!isAtByteBoundary()) + { + throwIOException("Not at byte boundary. Position: pos=" + m_pointerInByte); + } + } + + private void throwIOException(String msg, long pos) throws IOException + { + throw new IOException(msg + ". 
Position in stream: " + pos); + } + + private void throwIOException(String msg) throws IOException + { + throw new IOException(msg + ". Position in stream: " + m_numberOfBytesRead); + } + + public void skipToByteBoundary() throws IOException + { + assertNotAtEOF(); + if (m_pointerInByte != POINTER_START_OF_BYTE) + { + m_pointerInByte = POINTER_START_OF_BYTE; + m_curByte = readByte(); + } + } + + public boolean readBit() throws IOException + { + assertNotAtEOF(); + boolean res = (m_curByte & (1 << (7 - m_pointerInByte))) > 0; + incrementPointerPosition(); + return res; + } + + public int readBits(int no) throws IOException, IndexOutOfBoundsException + { + if (no < 0 || no > 8) + { + throw new IndexOutOfBoundsException("Invalid number of bits: " + no + ". Must be between 0 and 8 (inclusive)"); + } + assertNotAtEOF(); + + if (no == 0) + { + return 0; + } + + // Bytes are stored little bit endian + if (no + m_pointerInByte <= 8) + { + // All bits to read fit in the current byte + int res = (m_curByte >> (8 - no - m_pointerInByte)) & ((1 << no) - 1); + m_pointerInByte += no; + if (m_pointerInByte > POINTER_END_OF_BYTE) + { + m_curByte = readByte(); + m_pointerInByte = POINTER_START_OF_BYTE; + } + return res; + } + else + { + // Read remaining bits + first bits of next byte + int noToReadInByte2 = no - (8 - m_pointerInByte); + int res = (m_curByte & ((1 << (8 - m_pointerInByte)) - 1)) << noToReadInByte2; + m_curByte = readByte(); + assertNotAtEOF(); + m_pointerInByte = noToReadInByte2; + res += m_curByte >> (8 - noToReadInByte2); + return res; + } + } + + public int readBitsLittleEndian(int no) throws IOException, IndexOutOfBoundsException + { + if (no < 0 || no > 32) + { + throw new IndexOutOfBoundsException("Invalid number of bits: " + no + ". 
Must be between 0 and 32 (inclusive)"); + } + + if (no == 0) + { + return 0; + } + + int noReads = no / 8; + int mod = no % 8; + int res = 0; + if (mod != 0) + { + res = readBits(mod) << (noReads * 8); + } + for (int i = 0; i < noReads; i++) + { + res += readBits(8) << ((noReads - i - 1) * 8); + } + return res; + } + + public byte[] readBytes(byte[] barr, int off, int len) throws IOException, IndexOutOfBoundsException + { + if (off < 0) + { + throw new IndexOutOfBoundsException("Invalid offset " + off + ". It must be >= 0"); + } + if (len < 0) + { + throw new IndexOutOfBoundsException("Invalid length " + len + ". It must be >= 0"); + } + if (off + len > barr.length) + { + throw new IndexOutOfBoundsException("Invalid offset + length (" + off + " + " + len + "). It must be <= the length of the supplied array (" + barr.length + ")"); + } + + assertNotAtEOF(); + + if (len == 0) + { + return barr; + } + + if (isAtByteBoundary()) + { + // Special case: we are at the byte boundary. We just have to read + // the len next bytes and return them. + // The read method takes care of updating all internal state. + int noRead = read(barr, off, len); + if (noRead != len) + { + throwIOException("Unexpected EOF. Wanted to read " + len + " bytes. Got " + noRead, m_numberOfBytesRead - noRead); + } + } + else + { + int noRead = m_in.read(barr, off, len); + m_numberOfBytesRead += noRead; + if (noRead != len) + { + m_curByte = -1; + m_pointerInByte = POINTER_START_OF_BYTE; + throwIOException("Unexpected EOF. Wanted to read " + len + " bytes. Got " + noRead, m_numberOfBytesRead - noRead); + } + + // Shift bytes in the result array. Bytes are stored little (bit-) + // endian. + int lastByte = m_curByte; + m_curByte = barr[off + len - 1] & 0xFF; + // The distance to shift the second byte to the right. 
+ int rightShiftDistance = 8 - m_pointerInByte; + for (int i = off; i < off + len; i++) + { + int newLastByte = barr[i]; + barr[i] = (byte) (((lastByte << m_pointerInByte) | ((barr[i] & 0xFF) >>> rightShiftDistance)) & 0xFF); + lastByte = newLastByte; + } + } + return barr; + } + + @Override + public int read() throws IOException + { + assertAtByteBoundary(); + int res = m_curByte; + if (m_curByte != -1) + { + m_curByte = readByte(); + } + return res; + } + + @Override + public int read(byte[] barr) throws IOException + { + return read(barr, 0, barr.length); + } + + @Override + public int read(byte[] barr, int offset, int len) throws IndexOutOfBoundsException, IOException + { + if (offset < 0) + { + throw new IndexOutOfBoundsException("Illegal offset: " + offset); + } + else if (len < 0) + { + throw new IndexOutOfBoundsException("Illegal length: " + len); + } + else if ((offset + len) > barr.length) + { + throw new IndexOutOfBoundsException("Illegal offset + length: " + offset + " + " + len + ". 
Longer than the byte array: " + barr.length); + } + + assertAtByteBoundary(); + // A zero-length read must transfer nothing and return 0 (InputStream + // contract). Without this guard a byte would be stored at barr[offset] + // and consumed even though no bytes were requested. + if (len == 0) + { + return 0; + } + if (isAtEof()) + { + return -1; + } + else + { + barr[offset] = (byte) m_curByte; + int res = 1; + if (len > 1) + { + int noRead = m_in.read(barr, offset + 1, len - 1); + if (noRead > 0) + { + res += noRead; + m_numberOfBytesRead += noRead; + } + } + m_curByte = readByte(); + return res; + } + } + + @Override + public long skip(long n) throws IOException + { + assertAtByteBoundary(); + if (n <= 0L) + { + return 0L; + } + else + { + if (isAtEof()) + { + return 0L; + } + + if (n > 1L) + { + long noToSkip = n - 1L; + long noSkipped = m_in.skip(noToSkip); + m_numberOfBytesRead += noSkipped; + if (noSkipped < noToSkip) + { + // At EOF + m_curByte = -1; + return noSkipped + 1; + } + else + { + m_curByte = readByte(); + return noSkipped + 1; + } + } + else + { + m_curByte = readByte(); + return 1L; + } + } + } + + /** + * Get the number of bytes that can be read without blocking: the count + * reported by the wrapped stream plus one for the byte already held in + * this object, unless this stream is at EOF. + * @return The number of available bytes. + * @throws IOException On I/O errors or if the current position is not at a + * byte boundary. + */ + @Override + public int available() throws IOException + { + assertAtByteBoundary(); + // The parentheses are required: without them the conditional operator + // applies to the whole sum and the result can only be 0 or 1. + return m_in.available() + (m_curByte != -1 ? 1 : 0); + } + + @Override + public void close() throws IOException + { + m_in.close(); + } +} diff --git a/src/main/java/org/at4j/support/io/LittleEndianBitOutputStream.java b/src/main/java/org/at4j/support/io/LittleEndianBitOutputStream.java new file mode 100644 index 0000000..7b5d313 --- /dev/null +++ b/src/main/java/org/at4j/support/io/LittleEndianBitOutputStream.java @@ -0,0 +1,265 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version.
+ * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.at4j.support.io; + +import java.io.IOException; +import java.io.OutputStream; + +/** + * This is an {@link OutputStream} that implements {@link BitOutput} and hence + * can be used to write individual bits to the output. The bits are stored in + * little-endian order. + * @author Karl Gustafsson + * @since 1.1 + */ +public class LittleEndianBitOutputStream extends OutputStream implements BitOutput +{ + // 2^0 + private static final int POINTER_START_OF_BYTE = 0; + // 2^7 + private static final int POINTER_END_OF_BYTE = 7; + + private final OutputStream m_out; + + // The current byte + private int m_curByte = 0; + // The pointer to the current bit location in the current byte. + private int m_pointerInByte = POINTER_START_OF_BYTE; + + private long m_numberOfBytesWritten = 0; + + public LittleEndianBitOutputStream(OutputStream wrapped) + { + // Null check + wrapped.getClass(); + + m_out = wrapped; + } + + private boolean isAtByteBoundary() + { + return m_pointerInByte == POINTER_START_OF_BYTE; + } + + private void assertAtByteBoundary() throws IOException + { + if (!isAtByteBoundary()) + { + throwIOException("Not at byte boundary. Position: pos=" + m_pointerInByte); + } + } + + private void throwIOException(String msg) throws IOException + { + throw new IOException(msg + ". Position in stream: " + m_numberOfBytesWritten); + } + + private void writeCurByte() throws IOException + { + m_out.write(m_curByte); + m_numberOfBytesWritten++; + m_pointerInByte = POINTER_START_OF_BYTE; + m_curByte = 0; + } + + /** + * Get the total number of whole bytes written by this stream so far. 
+ * @return The number of whole bytes written. + */ + public long getNumberOfBytesWritten() + { + return m_numberOfBytesWritten; + } + + public int getUnfinishedByte() + { + return m_pointerInByte > 0 ? m_curByte >>> (7 - (m_pointerInByte - 1)) : 0; + } + + public int getNumberOfBitsInUnfinishedByte() + { + return m_pointerInByte; + } + + public void padToByteBoundary() throws IOException + { + if (m_pointerInByte > POINTER_START_OF_BYTE) + { + writeCurByte(); + } + } + + public void writeBit(boolean val) throws IOException + { + if (val) + { + m_curByte = m_curByte | 1 << (7 - m_pointerInByte); + } + m_pointerInByte++; + + if (m_pointerInByte > POINTER_END_OF_BYTE) + { + // Write the current byte and start a new one + writeCurByte(); + } + } + + public void writeBits(int val, int no) throws IOException, IndexOutOfBoundsException + { + if (no < 0 || no > 8) + { + throw new IndexOutOfBoundsException("Invalid number of bits " + no + ". Must be between 0 and 8 (inclusive)"); + } + + if (no == 0) + { + return; + } + + if (m_pointerInByte + no <= 8) + { + // All bits to write fit in the current byte + m_curByte = m_curByte | ((val & ((1 << no) - 1)) << (8 - m_pointerInByte - no)); + m_pointerInByte += no; + if (m_pointerInByte > POINTER_END_OF_BYTE) + { + writeCurByte(); + } + } + else + { + // Bits will have to be written in the next byte too + int bitsToWriteInCurByte = 8 - m_pointerInByte; + int bitsToWriteInNextByte = no - bitsToWriteInCurByte; + m_curByte = m_curByte | (val >>> (no - bitsToWriteInCurByte)); + writeCurByte(); + m_curByte = (val & ((1 << bitsToWriteInNextByte) - 1)) << (8 - bitsToWriteInNextByte); + m_pointerInByte = bitsToWriteInNextByte; + } + } + + public void writeBitsLittleEndian(int val, int no) throws IndexOutOfBoundsException, IOException + { + if (no < 0 || no > 32) + { + throw new IndexOutOfBoundsException("Invalid number of bits to write " + no + ". 
It must be between 0 and 32 (inclusive)"); + } + + if (no == 0) + { + return; + } + + int noWrites = no / 8; + int mod = no % 8; + if (mod != 0) + { + writeBits(val >>> (noWrites * 8), mod); + } + for (int i = 0; i < noWrites; i++) + { + writeBits(val >>> ((noWrites - i - 1) * 8), 8); + } + } + + public void writeBytes(byte[] barr, int off, int len) throws IndexOutOfBoundsException, IOException + { + if (off < 0) + { + throw new IndexOutOfBoundsException("Invalid offset " + off + ". It must be >= 0"); + } + if (len < 0) + { + throw new IndexOutOfBoundsException("Invalid length " + len + ". It must be >= 0"); + } + if (off + len > barr.length) + { + throw new IndexOutOfBoundsException("Invalid offset + length (" + off + " + " + len + "). It must be <= the length of the supplied array (" + barr.length + ")"); + } + + if (len == 0) + { + return; + } + + if (isAtByteBoundary()) + { + // Special case + m_out.write(barr, off, len); + m_numberOfBytesWritten += len; + } + else + { + // Copy the bytes to write to a new array. We cannot modify barr, + // even if it is tempting. 
+ byte[] toWrite = new byte[len]; + System.arraycopy(barr, off, toWrite, 0, len); + + int prevByte = m_curByte; + int leftShiftDistance = 8 - m_pointerInByte; + for (int i = 0; i < len; i++) + { + // Shift in bits from the previous byte and shift out bytes + // from this byte + int nextPrevByte = (toWrite[i] & 0xFF) << leftShiftDistance; + toWrite[i] = (byte) ((prevByte | ((toWrite[i] & 0xFF) >>> m_pointerInByte)) & 0xFF); + prevByte = nextPrevByte; + } + m_curByte = prevByte & 0xFF; + m_out.write(toWrite); + m_numberOfBytesWritten += len; + } + } + + @Override + public void write(int b) throws IOException + { + assertAtByteBoundary(); + m_out.write(b); + m_numberOfBytesWritten++; + } + + @Override + public void write(byte[] barr) throws IOException + { + write(barr, 0, barr.length); + } + + @Override + public void write(byte[] barr, int off, int len) throws IOException + { + assertAtByteBoundary(); + m_out.write(barr, off, len); + m_numberOfBytesWritten += len; + } + + /** + * Close the output stream. + *
<p>
+ * This method does not automatically pad the last written bits to a full + * byte. If there are bits written to it the stream must be padded before + * closing it. See {@link #padToByteBoundary()}. + */ + @Override + public void close() throws IOException + { + m_out.close(); + super.close(); + } +} diff --git a/src/main/java/org/at4j/support/io/package-info.java b/src/main/java/org/at4j/support/io/package-info.java new file mode 100644 index 0000000..1ff2ee1 --- /dev/null +++ b/src/main/java/org/at4j/support/io/package-info.java @@ -0,0 +1,25 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +/** + * Support classes that probably would have been in {@code java.io} if they + * had been a part of Java. 
+ * @since 1.0 + * @author Karl Gustafsson + */ +package org.at4j.support.io; \ No newline at end of file diff --git a/src/main/java/org/at4j/support/lang/At4JException.java b/src/main/java/org/at4j/support/lang/At4JException.java new file mode 100644 index 0000000..1bbe136 --- /dev/null +++ b/src/main/java/org/at4j/support/lang/At4JException.java @@ -0,0 +1,58 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.at4j.support.lang; + +/** + * This is a base class for exceptions in this project. It inherits + * {@link RuntimeException}, so it is unchecked. + * @author Karl Gustafsson + * @since 1.0 + */ +public class At4JException extends RuntimeException +{ + private static final long serialVersionUID = 1L; + + /** + * Create an exception with a message. + * @param msg The message. + */ + public At4JException(String msg) + { + super(msg); + } + + /** + * Create an exception that wraps another exception. + * @param t The other exception. + */ + public At4JException(Throwable t) + { + super(t); + } + + /** + * Create an exception that wraps another exception and has a message. + * @param msg The message. + * @param t The other exception. 
+ */ + public At4JException(String msg, Throwable t) + { + super(msg, t); + } +} diff --git a/src/main/java/org/at4j/support/lang/SignedInteger.java b/src/main/java/org/at4j/support/lang/SignedInteger.java new file mode 100644 index 0000000..ab31e1b --- /dev/null +++ b/src/main/java/org/at4j/support/lang/SignedInteger.java @@ -0,0 +1,197 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.at4j.support.lang; + +/** + * This class represents a signed integer value (i.e. a plain {@code int} + * value). If the {@link java.lang.Integer} class had not been declared {@code + * final}, this class would probably have extended it. + *
<p>
+ * Signed integer objects are created by calling any of the static creation + * methods on this class. + *
<p>
+ * Instances of this class are immutable. + * @author Karl Gustafsson + * @since 1.1.1 + * @see UnsignedInteger + * @see SignedLong + */ +public class SignedInteger implements Comparable<SignedInteger> +{ + /** + * This constant represents the value {@code 0}. + */ + public static final SignedInteger ZERO = new SignedInteger(0); + + /** + * This constant represents the value {@code 1}. + */ + public static final SignedInteger ONE = new SignedInteger(1); + + private final int m_value; + + /** + * Create a new signed integer value. + * @param value The value. + */ + private SignedInteger(int value) + { + m_value = value; + } + + /** + * Create a new signed integer value. + * @param value The integer value. + * @return The signed integer value. + */ + public static SignedInteger valueOf(int value) + { + if (value == 0) + { + return ZERO; + } + else if (value == 1) + { + return ONE; + } + else + { + return new SignedInteger(value); + } + } + + /** + * Get the signed integer value. + * @return The signed integer value. + */ + public long intValue() + { + return m_value; + } + + /** + * Get the signed integer value represented as a big-endian byte array (four + * bytes long). + * NOTE(review): as implemented, index 0 of the returned array holds the + * least significant byte of the value; confirm the intended byte order + * with callers before relying on the method name. + * @return The integer value represented as a big-endian byte array. + * @see #fromBigEndianByteArray(byte[]) + * @see #getLittleEndianByteArray() + */ + public byte[] getBigEndianByteArray() + { + byte[] res = new byte[4]; + res[0] = (byte) m_value; + res[1] = (byte) (m_value >> 8); + res[2] = (byte) (m_value >> 16); + res[3] = (byte) (m_value >> 24); + return res; + } + + /** + * Create a signed integer value from a four bytes long big-endian byte + * array. + * @param barr The byte array. It must be four bytes long. + * @return The signed integer value. + * @throws IllegalArgumentException If the byte array is not four bytes + * long.
+ * @see #getBigEndianByteArray() + * @see #fromLittleEndianByteArray(byte[]) + */ + public static SignedInteger fromBigEndianByteArray(byte[] barr) throws IllegalArgumentException + { + if (barr.length != 4) + { + throw new IllegalArgumentException("Illegal size of supplied byte array: " + barr.length + ". It must be four bytes long"); + } + int value = barr[0] & 0xFF; + value += ((barr[1] & 0xFFL) << 8); + value += ((barr[2] & 0xFFL) << 16); + value += ((barr[3] & 0xFFL) << 24); + return valueOf(value); + } + + /** + * Get the signed integer value represented as a little-endian byte array + * (four bytes long). + * @return The integer value represented as a little-endian byte array. + * @see #getBigEndianByteArray() + * @see #fromBigEndianByteArray(byte[]) + */ + public byte[] getLittleEndianByteArray() + { + byte[] res = new byte[4]; + res[0] = (byte) (m_value >> 24); + res[1] = (byte) (m_value >> 16); + res[2] = (byte) (m_value >> 8); + res[3] = (byte) m_value; + return res; + } + + /** + * Create a signed integer value from an four bytes long little-endian byte + * array. + * @param barr The byte array. It must be four bytes long. + * @return The signed integer value. + * @throws IllegalArgumentException If the byte array is not four bytes + * long. + * @see #getLittleEndianByteArray() + * @see #fromBigEndianByteArray(byte[]) + */ + public static SignedInteger fromLittleEndianByteArray(byte[] barr) throws IllegalArgumentException + { + if (barr.length != 4) + { + throw new IllegalArgumentException("Illegal size of supplied byte array: " + barr.length + ". 
It must be four bytes long"); + } + int value = barr[3] & 0xFF; + value += ((barr[2] & 0xFFL) << 8); + value += ((barr[1] & 0xFFL) << 16); + value += ((barr[0] & 0xFFL) << 24); + return valueOf(value); + } + + @Override + public boolean equals(Object o) + { + if (o != null && o instanceof SignedInteger) + { + return m_value == ((SignedInteger) o).m_value; + } + else + { + return false; + } + } + + @Override + public int hashCode() + { + return m_value; + } + + public int compareTo(SignedInteger l2) + { + return Integer.valueOf(m_value).compareTo(Integer.valueOf(l2.m_value)); + } + + @Override + public String toString() + { + return "" + m_value; + } +} diff --git a/src/main/java/org/at4j/support/lang/SignedLong.java b/src/main/java/org/at4j/support/lang/SignedLong.java new file mode 100644 index 0000000..f2b3314 --- /dev/null +++ b/src/main/java/org/at4j/support/lang/SignedLong.java @@ -0,0 +1,213 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.at4j.support.lang; + +/** + * This class represents a signed long value (i.e. a plain {@code long} value). + * If the {@link java.lang.Long} class had not been declared {@code final}, this + * class would probably have extended it. + *
<p>
+ * Signed long objects are created by calling any of the static creation methods + * on this class. + *
<p>
+ * Instances of this class are immutable. + * @author Karl Gustafsson + * @since 1.0 + * @see UnsignedLong + * @see SignedInteger + */ +public class SignedLong implements Comparable<SignedLong> +{ + /** + * This constant represents the value {@code 0}. + */ + public static final SignedLong ZERO = new SignedLong(0); + + /** + * This constant represents the value {@code 1}. + */ + public static final SignedLong ONE = new SignedLong(1); + + private final long m_value; + + /** + * Create a new signed long value. + * @param value The value. + */ + private SignedLong(long value) + { + m_value = value; + } + + /** + * Create a new signed long value. + * @param value The long value. + * @return The signed long value. + */ + public static SignedLong valueOf(long value) + { + if (value == 0) + { + return ZERO; + } + else if (value == 1) + { + return ONE; + } + else + { + return new SignedLong(value); + } + } + + /** + * Get the signed long value. + * @return The signed long value. + */ + public long longValue() + { + return m_value; + } + + /** + * Get the signed long value represented as a big-endian byte array (eight + * bytes long). + * NOTE(review): as implemented, index 0 of the returned array holds the + * least significant byte of the value; confirm the intended byte order + * with callers before relying on the method name. + * @return The long value represented as a big-endian byte array. + * @see #fromBigEndianByteArray(byte[]) + * @see #getLittleEndianByteArray() + */ + public byte[] getBigEndianByteArray() + { + byte[] res = new byte[8]; + res[0] = (byte) m_value; + res[1] = (byte) (m_value >> 8); + res[2] = (byte) (m_value >> 16); + res[3] = (byte) (m_value >> 24); + res[4] = (byte) (m_value >> 32); + res[5] = (byte) (m_value >> 40); + res[6] = (byte) (m_value >> 48); + res[7] = (byte) (m_value >> 56); + return res; + } + + /** + * Create a signed long value from an eight bytes long big-endian byte + * array. + * @param barr The byte array. It must be eight bytes long. + * @return The signed long value. + * @throws IllegalArgumentException If the byte array is not eight bytes + * long.
+ * @see #getBigEndianByteArray() + * @see #fromLittleEndianByteArray(byte[]) + */ + public static SignedLong fromBigEndianByteArray(byte[] barr) throws IllegalArgumentException + { + if (barr.length != 8) + { + throw new IllegalArgumentException("Illegal size of supplied byte array: " + barr.length + ". It must be eight bytes long"); + } + long value = barr[0] & 0xFF; + value += ((barr[1] & 0xFFL) << 8); + value += ((barr[2] & 0xFFL) << 16); + value += ((barr[3] & 0xFFL) << 24); + value += ((barr[4] & 0xFFL) << 32); + value += ((barr[5] & 0xFFL) << 40); + value += ((barr[6] & 0xFFL) << 48); + value += ((barr[7] & 0xFFL) << 56); + return valueOf(value); + } + + /** + * Get the signed long value represented as a little-endian byte array + * (eight bytes long). + * @return The long value represented as a little-endian byte array. + * @see #getBigEndianByteArray() + * @see #fromBigEndianByteArray(byte[]) + */ + public byte[] getLittleEndianByteArray() + { + byte[] res = new byte[8]; + res[0] = (byte) (m_value >> 56); + res[1] = (byte) (m_value >> 48); + res[2] = (byte) (m_value >> 40); + res[3] = (byte) (m_value >> 32); + res[4] = (byte) (m_value >> 24); + res[5] = (byte) (m_value >> 16); + res[6] = (byte) (m_value >> 8); + res[7] = (byte) m_value; + return res; + } + + /** + * Create a signed long value from an eight bytes long little-endian byte + * array. + * @param barr The byte array. It must be eight bytes long. + * @return The signed long value. + * @throws IllegalArgumentException If the byte array is not eight bytes + * long. + * @see #getLittleEndianByteArray() + * @see #fromBigEndianByteArray(byte[]) + */ + public static SignedLong fromLittleEndianByteArray(byte[] barr) throws IllegalArgumentException + { + if (barr.length != 8) + { + throw new IllegalArgumentException("Illegal size of supplied byte array: " + barr.length + ". 
It must be eight bytes long"); + } + long value = barr[7] & 0xFF; + value += ((barr[6] & 0xFFL) << 8); + value += ((barr[5] & 0xFFL) << 16); + value += ((barr[4] & 0xFFL) << 24); + value += ((barr[3] & 0xFFL) << 32); + value += ((barr[2] & 0xFFL) << 40); + value += ((barr[1] & 0xFFL) << 48); + value += ((barr[0] & 0xFFL) << 56); + return valueOf(value); + } + + @Override + public boolean equals(Object o) + { + if (o != null && o instanceof SignedLong) + { + return m_value == ((SignedLong) o).m_value; + } + else + { + return false; + } + } + + @Override + public int hashCode() + { + return (int) (m_value ^ (m_value >>> 32)); + } + + public int compareTo(SignedLong l2) + { + return Long.valueOf(m_value).compareTo(Long.valueOf(l2.m_value)); + } + + @Override + public String toString() + { + return "" + m_value; + } +} diff --git a/src/main/java/org/at4j/support/lang/UnsignedByte.java b/src/main/java/org/at4j/support/lang/UnsignedByte.java new file mode 100644 index 0000000..b72715f --- /dev/null +++ b/src/main/java/org/at4j/support/lang/UnsignedByte.java @@ -0,0 +1,197 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ +package org.at4j.support.lang; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Serializable; + + +/** + * This object represents an unsigned byte (eight bits) with a value between + * {@code 0} and {@code 255} (inclusive). It is immutable. + *
<p>
+ * Unsigned byte instances are created by calling any of the static {@code + * valueOf} methods on this class. + * @author Karl Gustafsson + * @since 1.0 + * @see UnsignedShort + * @see UnsignedInteger + * @see UnsignedLong + */ +public final class UnsignedByte implements Serializable, Comparable<UnsignedByte> +{ + private static final long serialVersionUID = 1L; + + /** + * The maximum value of an unsigned byte (255). + */ + public static final short MAX_VALUE = (1 << 8) - 1; + + /** + * The minimum value of an unsigned byte (0). + */ + public static final short MIN_VALUE = 0; + + /** + * The value 0. + */ + public static final UnsignedByte ZERO = new UnsignedByte((byte) 0); + + /** + * The value 1. + */ + public static final UnsignedByte ONE = new UnsignedByte((byte) 1); + + private final byte m_value; + + private UnsignedByte(byte value) + { + m_value = value; + } + + /** + * Create an unsigned byte value from the supplied byte value. The supplied + * value is treated as if it was unsigned, which means that negative + * argument values will result in unsigned byte values between 128 and 255. + * @param value The value. + * @return The unsigned byte value. + * @see #valueOf(short) + * @see #valueOf(int) + */ + public static UnsignedByte valueOf(byte value) + { + switch (value) + { + case 0: + return ZERO; + case 1: + return ONE; + default: + return new UnsignedByte(value); + } + } + + private static UnsignedByte valueOfSafe(int value) + { + return valueOf((byte) (value & 0xFF)); + } + + /** + * Create a new unsigned byte value from the supplied {@code short} value + * which must be in the range {@code 0} to {@code 255} (inclusive). + * @param value The value. + * @return An unsigned byte value. + * @throws IllegalArgumentException If the supplied value is not in the + * permitted range.
+ */ + public static UnsignedByte valueOf(short value) throws IllegalArgumentException + { + if ((value < MIN_VALUE) || (value > MAX_VALUE)) + { + throw new IllegalArgumentException("Illegal unsigned byte value " + value + ". It must be between " + MIN_VALUE + " and " + MAX_VALUE + " (inclusive)"); + } + return valueOf((byte) (value & 0xFF)); + } + + /** + * Create a new unsigned byte value from the supplied {@code int} value + * which must be in the range {@code 0} to {@code 255} (inclusive). + * @param value The value. + * @return An unsigned byte value. + * @throws IllegalArgumentException If the supplied value is not in the + * permitted range. + */ + public static UnsignedByte valueOf(int value) throws IllegalArgumentException + { + if ((value < MIN_VALUE) || (value > MAX_VALUE)) + { + throw new IllegalArgumentException("Illegal unsigned byte value " + value + ". It must be between " + MIN_VALUE + " and " + MAX_VALUE + " (inclusive)"); + } + return valueOf((byte) (value & 0xFF)); + } + + /** + * Get the unsigned byte value as an {@code int}. + * @return The value. + */ + public int intValue() + { + return m_value & 0xFF; + } + + /** + * Get the unsigned byte value as a {@code short}. + * @return The value. + */ + public short shortValue() + { + return (short) (m_value & 0xFF); + } + + /** + * Get the unsigned byte value as a signed byte value between {@code -128} + * and {@code 127} (inclusive). + * @return The value. + */ + public byte byteValue() + { + return m_value; + } + + /** + * Is the specified bit set in the byte value? + * @param no The index number of the bit. Bit 0 is the bit representing the + * value 1, bit 7 is the bit representing the value 128. + * @return {@code true} if the specified bit is set. + * @throws IllegalArgumentException If {@code no} is not in the range + * {@code 0 <= no <= 7} (inclusive). 
+ */ + public boolean isBitSet(int no) throws IllegalArgumentException + { + if (no < 0 || no > 7) + { + throw new IllegalArgumentException("Invalid bit number " + no + ". It must be between 0 and 7 (inclusive)"); + } + return (m_value & (1 << no)) > 0; + } + + @Override + public boolean equals(Object o) + { + return (o instanceof UnsignedByte) && (((UnsignedByte) o).m_value == m_value); + } + + @Override + public int hashCode() + { + return m_value; + } + + public int compareTo(UnsignedByte b2) + { + return intValue() - b2.intValue(); + } + + @Override + public String toString() + { + return Short.toString((short) (m_value & 0xFF)); + } +} diff --git a/src/main/java/org/at4j/support/lang/UnsignedInteger.java b/src/main/java/org/at4j/support/lang/UnsignedInteger.java new file mode 100644 index 0000000..f6dfed3 --- /dev/null +++ b/src/main/java/org/at4j/support/lang/UnsignedInteger.java @@ -0,0 +1,243 @@ +/* AT4J -- Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +package org.at4j.support.lang; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Serializable; + +/** + * This object represents an unsigned integer (four bytes or 32 bits) with a + * value between {code 0} and {@code 4294967295}. It is immutable. + *

/**
 * An immutable unsigned integer (four bytes or 32 bits) with a value between
 * {@code 0} and {@code 4294967295} (inclusive).
 * <p>
 * Unsigned integers are created by calling any of the static creation methods
 * of this class.
 * <p>
 * Note on byte order: the methods named {@code ...BigEndian...} in this class
 * place the <em>least</em> significant byte first, and
 * {@link #fromLittleEndianByteArrayToLong(byte[], int)} reads the
 * <em>most</em> significant byte first. The names are kept for compatibility
 * with existing callers.
 * @author Karl Gustafsson
 * @since 1.0
 * @see SignedInteger
 * @see UnsignedByte
 * @see UnsignedShort
 * @see UnsignedLong
 */
public final class UnsignedInteger implements Serializable, Comparable<UnsignedInteger>
{
    private static final long serialVersionUID = 1L;

    /**
     * Each unsigned integer is four bytes long.
     */
    public static final int SIZE = 4;

    /**
     * The maximum value of an unsigned integer (4294967295).
     */
    public static final long MAX_VALUE = (1L << 32) - 1;

    /**
     * The minimum value of an unsigned integer (0).
     */
    public static final int MIN_VALUE = 0;

    /**
     * The value 0.
     */
    public static final UnsignedInteger ZERO = new UnsignedInteger(0);

    /**
     * The value 1.
     */
    public static final UnsignedInteger ONE = new UnsignedInteger(1);

    // The raw 32 bits; interpreted as unsigned by masking with 0xFFFFFFFFL on read.
    private final int m_value;

    private UnsignedInteger(int value)
    {
        m_value = value;
    }

    /**
     * Create a new unsigned integer. The supplied integer is treated as an
     * unsigned value, which means that negative argument values will result in
     * unsigned integer values between {@code 2147483648} and {@code 4294967295}
     * (inclusive).
     * @param value The signed integer value.
     * @return An unsigned integer value.
     */
    public static UnsignedInteger valueOf(int value)
    {
        switch (value)
        {
            case 0:
                return ZERO;
            case 1:
                return ONE;
            default:
                return new UnsignedInteger(value);
        }
    }

    /**
     * Create an unsigned integer from the supplied long value which must be
     * between {@code 0} and {@code 4294967295} (inclusive).
     * @param value The value.
     * @return The unsigned integer value.
     * @throws IllegalArgumentException If the supplied value is not in the
     * permitted range.
     */
    public static UnsignedInteger valueOf(long value) throws IllegalArgumentException
    {
        if ((value < MIN_VALUE) || (value > MAX_VALUE))
        {
            throw new IllegalArgumentException("Illegal unsigned integer value " + value + ". It must be between " + MIN_VALUE + " and " + MAX_VALUE + " (inclusive)");
        }
        // The narrowing cast keeps exactly the low 32 bits.
        return valueOf((int) value);
    }

    /**
     * Get the unsigned integer value represented as a {@code long}.
     * @return The value, between {@code 0} and {@code 4294967295}.
     */
    public long longValue()
    {
        return m_value & 0xFFFFFFFFL;
    }

    /**
     * Get the unsigned integer value converted to a signed integer.
     * @return The unsigned integer value converted to a signed integer.
     */
    public int intValue()
    {
        return m_value;
    }

    /**
     * Get the unsigned integer value as a four bytes long byte array. The
     * least significant byte is stored at index 0 and the most significant
     * byte at index 3.
     * @return The value represented as a byte array.
     */
    public byte[] getBigEndianByteArray()
    {
        byte[] res = new byte[SIZE];
        res[0] = (byte) m_value;
        res[1] = (byte) (m_value >>> 8);
        res[2] = (byte) (m_value >>> 16);
        res[3] = (byte) (m_value >>> 24);
        return res;
    }

    /**
     * Create an unsigned integer value from a four bytes long byte array in
     * which the least significant byte comes first.
     * @param barr The byte array. It must be four bytes long.
     * @return The unsigned integer.
     * @throws IllegalArgumentException If the supplied byte array is not four
     * bytes long.
     * @see #fromBigEndianByteArray(byte[], int)
     * @see #fromBigEndianByteArrayToLong(byte[], int)
     */
    public static UnsignedInteger fromBigEndianByteArray(byte[] barr) throws IllegalArgumentException
    {
        if (barr.length != SIZE)
        {
            throw new IllegalArgumentException("The supplied byte array must be four bytes long");
        }
        return fromBigEndianByteArray(barr, 0);
    }

    /**
     * Create an unsigned integer value from four bytes read from the given
     * offset position in the supplied byte array. The most significant byte is
     * the last byte read.
     * @param barr The byte array to read from.
     * @param offset The offset in the byte array where the least significant
     * (first) byte is.
     * @return An unsigned integer.
     * @throws ArrayIndexOutOfBoundsException If the supplied array is too short
     * or if the offset is negative.
     * @see #fromBigEndianByteArray(byte[])
     * @see #fromBigEndianByteArrayToLong(byte[], int)
     */
    public static UnsignedInteger fromBigEndianByteArray(byte[] barr, int offset) throws ArrayIndexOutOfBoundsException
    {
        return valueOf((int) fromBigEndianByteArrayToLong(barr, offset));
    }

    /**
     * Create a long value representing the unsigned integer value in the byte
     * array at the specified offset. The most significant byte is the last byte
     * read.
     * @param barr The byte array to read from.
     * @param offset The offset in the byte array where the least significant
     * (first) byte is.
     * @return A non-negative {@code long} representing the unsigned integer.
     * @throws ArrayIndexOutOfBoundsException If the supplied array is too short
     * or if the offset is negative.
     * @see #fromBigEndianByteArray(byte[])
     * @see #fromBigEndianByteArray(byte[], int)
     * @see #fromLittleEndianByteArrayToLong(byte[], int)
     * @since 1.1
     */
    public static long fromBigEndianByteArrayToLong(byte[] barr, int offset) throws ArrayIndexOutOfBoundsException
    {
        // Mask with a long constant so that a top byte >= 0x80 does not
        // overflow into the sign bit of an int before widening.
        return (barr[offset] & 0xFFL)
            | ((barr[offset + 1] & 0xFFL) << 8)
            | ((barr[offset + 2] & 0xFFL) << 16)
            | ((barr[offset + 3] & 0xFFL) << 24);
    }

    /**
     * Create a long value representing the unsigned integer value in the byte
     * array at the specified offset. The most significant byte is the first
     * byte read.
     * @param barr The byte array to read from.
     * @param offset The offset in the byte array where the most significant
     * (first) byte is.
     * @return A non-negative {@code long} representing the unsigned integer.
     * @throws ArrayIndexOutOfBoundsException If the supplied array is too short
     * or if the offset is negative.
     * @see #fromBigEndianByteArrayToLong(byte[], int)
     * @since 1.1
     */
    public static long fromLittleEndianByteArrayToLong(byte[] barr, int offset) throws ArrayIndexOutOfBoundsException
    {
        // Long arithmetic for the same reason as in fromBigEndianByteArrayToLong.
        return (barr[offset + 3] & 0xFFL)
            | ((barr[offset + 2] & 0xFFL) << 8)
            | ((barr[offset + 1] & 0xFFL) << 16)
            | ((barr[offset] & 0xFFL) << 24);
    }

    @Override
    public boolean equals(Object o)
    {
        return (o instanceof UnsignedInteger) && (((UnsignedInteger) o).m_value == m_value);
    }

    @Override
    public int hashCode()
    {
        return m_value;
    }

    /**
     * Compare the two values numerically, as unsigned quantities.
     * @param i2 The value to compare with.
     * @return A negative number, zero or a positive number if this value is
     * less than, equal to or greater than {@code i2}.
     */
    @Override
    public int compareTo(UnsignedInteger i2)
    {
        return Long.compare(longValue(), i2.longValue());
    }

    @Override
    public String toString()
    {
        return Long.toString(m_value & 0xFFFFFFFFL);
    }
}
+ */ +package org.at4j.support.lang; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Serializable; +import java.math.BigInteger; + +/** + * This object represents an unsigned long (eight bytes or 64 bits) with a value + * between {@code 0} and {@code 18446744073709551615}. It is immutable. + *

/**
 * An immutable unsigned long (eight bytes or 64 bits) with a value between
 * {@code 0} and {@code 18446744073709551615} (inclusive).
 * <p>
 * Unsigned longs are created by calling any of the static creation methods of
 * this class.
 * <p>
 * Note on byte order: the methods named {@code ...BigEndian...} in this class
 * place the <em>least</em> significant byte first. The names are kept for
 * compatibility with existing callers.
 * @author Karl Gustafsson
 * @since 1.0
 * @see SignedLong
 * @see UnsignedByte
 * @see UnsignedShort
 * @see UnsignedInteger
 */
public final class UnsignedLong implements Serializable, Comparable<UnsignedLong>
{
    private static final long serialVersionUID = 1L;

    /**
     * The minimum allowed value (0).
     */
    public static final BigInteger MIN_VALUE = BigInteger.valueOf(0L);

    /**
     * The maximum allowed value (18446744073709551615).
     */
    public static final BigInteger MAX_VALUE;

    /**
     * The value zero.
     */
    public static final UnsignedLong ZERO = new UnsignedLong(0L);

    /**
     * The value one.
     */
    public static final UnsignedLong ONE = new UnsignedLong(1L);

    // 2^63: the magnitude carried by the sign bit of the underlying long.
    private static final BigInteger HIGHEST_BIT_VALUE;
    static
    {
        BigInteger two = BigInteger.valueOf(2L);
        MAX_VALUE = two.pow(64).subtract(BigInteger.ONE);
        HIGHEST_BIT_VALUE = two.pow(63);
    }

    // The raw 64 bits; values above Long.MAX_VALUE are stored as negative longs.
    private final long m_value;

    private UnsignedLong(long value)
    {
        m_value = value;
    }

    /**
     * Create an unsigned long. The supplied value is treated as an unsigned
     * long, which means that negative argument values will result in unsigned
     * long values between {@code 9223372036854775808} and {@code
     * 18446744073709551615} (inclusive).
     * @param value The value.
     * @return An unsigned long value.
     */
    public static UnsignedLong valueOf(long value)
    {
        if (value == 0L)
        {
            return ZERO;
        }
        if (value == 1L)
        {
            return ONE;
        }
        return new UnsignedLong(value);
    }

    /**
     * Create an unsigned long value from the supplied {@link BigInteger} value
     * which must be in the range {@code 0} to {@code 18446744073709551615}
     * (inclusive)
     * @param value The value.
     * @return An unsigned long value.
     * @throws IllegalArgumentException If the supplied value is negative or if
     * it is greater than {@link #MAX_VALUE}.
     */
    public static UnsignedLong valueOf(BigInteger value) throws IllegalArgumentException
    {
        if ((value.compareTo(MIN_VALUE) < 0) || (value.compareTo(MAX_VALUE) > 0))
        {
            throw new IllegalArgumentException("Illegal unsigned long value " + value + ". It must be between 0 and " + MAX_VALUE + " (inclusive)");
        }
        // BigInteger.longValue() keeps the low 64 bits, which is the
        // two's-complement encoding of the unsigned value.
        return valueOf(value.longValue());
    }

    /**
     * Get the unsigned long value as a {@link BigInteger}.
     * @return The unsigned long value as a {@link BigInteger}.
     */
    public BigInteger bigIntValue()
    {
        BigInteger lower63Bits = BigInteger.valueOf(m_value & 0x7FFFFFFFFFFFFFFFL);
        return m_value < 0 ? lower63Bits.add(HIGHEST_BIT_VALUE) : lower63Bits;
    }

    /**
     * Return the value as a signed long. If the value is at most
     * {@link Long#MAX_VALUE}, it is returned as a non-negative long. If not, it
     * is returned as a negative long.
     * @return The value as a signed long value.
     */
    public long longValue()
    {
        return m_value;
    }

    /**
     * Get the unsigned long value as an eight bytes long byte array. The least
     * significant byte is stored at index 0 and the most significant byte at
     * index 7.
     * @return The value represented as a byte array.
     */
    public byte[] getBigEndianByteArray()
    {
        byte[] res = new byte[8];
        for (int i = 0; i < res.length; i++)
        {
            res[i] = (byte) (m_value >>> (8 * i));
        }
        return res;
    }

    /**
     * Create an unsigned long value from an eight bytes long byte array in
     * which the least significant byte comes first.
     * @param barr The byte array. It must be eight bytes long.
     * @return The unsigned long.
     * @throws IllegalArgumentException If the supplied byte array is not eight
     * bytes long.
     * @see #fromBigEndianByteArray(byte[], int)
     */
    public static UnsignedLong fromBigEndianByteArray(byte[] barr) throws IllegalArgumentException
    {
        if (barr.length != 8)
        {
            throw new IllegalArgumentException("The supplied byte array must be eight bytes long");
        }
        return fromBigEndianByteArray(barr, 0);
    }

    /**
     * Create an unsigned long value from eight bytes read from the given offset
     * position in the supplied byte array. The most significant byte is the
     * last byte read.
     * @param barr The byte array to read from.
     * @param offset The offset in the byte array where the least significant
     * (first) byte is.
     * @return An unsigned long.
     * @throws ArrayIndexOutOfBoundsException If the supplied array is too short
     * or if the offset is negative.
     * @see #fromBigEndianByteArray(byte[])
     */
    public static UnsignedLong fromBigEndianByteArray(byte[] barr, int offset) throws ArrayIndexOutOfBoundsException
    {
        long bits = 0L;
        for (int i = 0; i < 8; i++)
        {
            bits |= (barr[offset + i] & 0xFFL) << (8 * i);
        }
        return valueOf(bits);
    }

    @Override
    public boolean equals(Object o)
    {
        // instanceof is already false for null, so no separate null check.
        return (o instanceof UnsignedLong) && (((UnsignedLong) o).m_value == m_value);
    }

    @Override
    public int hashCode()
    {
        return (int) (m_value ^ (m_value >>> 32));
    }

    /**
     * Compare the two values numerically, as unsigned quantities.
     * @param l2 The value to compare with.
     * @return A negative number, zero or a positive number if this value is
     * less than, equal to or greater than {@code l2}.
     */
    @Override
    public int compareTo(UnsignedLong l2)
    {
        return Long.compareUnsigned(m_value, l2.m_value);
    }

    @Override
    public String toString()
    {
        return Long.toUnsignedString(m_value);
    }
}
Archive file tools for Java -- http://www.at4j.org + * Copyright (C) 2009 Karl Gustafsson + * + * This file is a part of AT4J + * + * AT4J is free software: you can redistribute it and/or modify it under the + * terms of the GNU Lesser General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * AT4J is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package org.at4j.support.lang; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Serializable; + + +/** + * This object represents an unsigned short value (two bytes or 16 bits) with a + * value between {@code 0} and {@code 65535}. It is immutable. + *

/**
 * An immutable unsigned short value (two bytes or 16 bits) with a value
 * between {@code 0} and {@code 65535} (inclusive).
 * <p>
 * Unsigned shorts are created by calling any of the static creation methods of
 * this class.
 * <p>
 * Note on byte order: the methods named {@code ...BigEndian...} in this class
 * place the <em>least</em> significant byte first. The names are kept for
 * compatibility with existing callers.
 * @author Karl Gustafsson
 * @since 1.0
 * @see UnsignedByte
 * @see UnsignedInteger
 * @see UnsignedLong
 */
public final class UnsignedShort implements Serializable, Comparable<UnsignedShort>
{
    private static final long serialVersionUID = 1L;

    /**
     * Each unsigned short is two bytes long.
     */
    public static final int SIZE = 2;

    /**
     * The maximum value of an unsigned short (65535).
     */
    public static final int MAX_VALUE = (1 << 16) - 1;

    /**
     * The minimum value of an unsigned short (0).
     */
    public static final int MIN_VALUE = 0;

    /**
     * The value 0.
     */
    public static final UnsignedShort ZERO = new UnsignedShort((short) 0);

    /**
     * The value 1.
     */
    public static final UnsignedShort ONE = new UnsignedShort((short) 1);

    /**
     * The value 1000.
     */
    public static final UnsignedShort ONE_THOUSAND = new UnsignedShort((short) 1000);

    // The raw 16 bits; interpreted as unsigned by masking with 0xFFFF on read.
    private final short m_value;

    private UnsignedShort(short value)
    {
        m_value = value;
    }

    /**
     * Create a new unsigned short. The supplied short is treated as an unsigned
     * value, which means that negative argument values will result in unsigned
     * short values between {@code 32768} and {@code 65535} (inclusive).
     * @param value The signed short value.
     * @return An unsigned short value.
     */
    public static UnsignedShort valueOf(short value)
    {
        switch (value)
        {
            case 0:
                return ZERO;
            case 1:
                return ONE;
            case 1000:
                return ONE_THOUSAND;
            default:
                return new UnsignedShort(value);
        }
    }

    /**
     * Create an unsigned short from the supplied integer value which must be
     * between {@code 0} and {@code 65535} (inclusive).
     * @param value The value.
     * @return The unsigned short value.
     * @throws IllegalArgumentException If the supplied value is not in the
     * permitted range.
     */
    public static UnsignedShort valueOf(int value) throws IllegalArgumentException
    {
        if ((value < MIN_VALUE) || (value > MAX_VALUE))
        {
            throw new IllegalArgumentException("Illegal unsigned short value " + value + ". It must be between " + MIN_VALUE + " and " + MAX_VALUE + " (inclusive)");
        }
        // The narrowing cast keeps exactly the low 16 bits.
        return valueOf((short) value);
    }

    /**
     * Get the unsigned short value.
     * @return The value, between {@code 0} and {@code 65535}.
     */
    public int intValue()
    {
        return m_value & 0xFFFF;
    }

    /**
     * Get the unsigned short value as a two bytes long byte array. The least
     * significant byte is stored at index 0 and the most significant byte at
     * index 1.
     * @return The value represented as a byte array.
     */
    public byte[] getBigEndianByteArray()
    {
        int unsigned = intValue();
        byte[] res = new byte[SIZE];
        res[0] = (byte) unsigned;
        res[1] = (byte) (unsigned >>> 8);
        return res;
    }

    /**
     * Create an unsigned short value from a two bytes long byte array in which
     * the least significant byte comes first.
     * @param barr The byte array. It must be two bytes long.
     * @return The unsigned short.
     * @throws IllegalArgumentException If the supplied byte array is not two
     * bytes long.
     * @see #fromBigEndianByteArray(byte[], int)
     */
    public static UnsignedShort fromBigEndianByteArray(byte[] barr) throws IllegalArgumentException
    {
        if (barr.length != SIZE)
        {
            throw new IllegalArgumentException("The supplied byte array must be two bytes long");
        }
        return fromBigEndianByteArray(barr, 0);
    }

    /**
     * Create an unsigned short value from two bytes read from the given offset
     * position in the supplied byte array. The most significant byte is the
     * last byte read.
     * @param barr The byte array to read from.
     * @param offset The offset in the byte array where the least significant
     * (first) byte is.
     * @return An unsigned short.
     * @throws ArrayIndexOutOfBoundsException If the supplied array is too short
     * or if the offset is negative.
     * @see #fromBigEndianByteArray(byte[])
     */
    public static UnsignedShort fromBigEndianByteArray(byte[] barr, int offset) throws ArrayIndexOutOfBoundsException
    {
        int low = barr[offset] & 0xFF;
        int high = barr[offset + 1] & 0xFF;
        return valueOf((short) (low | (high << 8)));
    }

    @Override
    public boolean equals(Object o)
    {
        return (o instanceof UnsignedShort) && (((UnsignedShort) o).m_value == m_value);
    }

    @Override
    public int hashCode()
    {
        return m_value;
    }

    /**
     * Compare the two values numerically, as unsigned quantities.
     * @param s2 The value to compare with.
     * @return A negative number, zero or a positive number if this value is
     * less than, equal to or greater than {@code s2}.
     */
    @Override
    public int compareTo(UnsignedShort s2)
    {
        return Integer.compare(intValue(), s2.intValue());
    }

    @Override
    public String toString()
    {
        return Integer.toString(m_value & 0xFFFF);
    }
}