3435 lines
101 KiB
C++
Raw Normal View History

2022-04-07 18:46:57 +02:00
///////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 2009-2014 DreamWorks Animation LLC.
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of DreamWorks Animation nor the names of
// its contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
///////////////////////////////////////////////////////////////////////////
//---------------------------------------------------
//
// class DwaCompressor -- Store lossy RGB data by quantizing
// DCT components.
//
// First, we try and figure out what compression strategy to take
// based in channel name. For RGB channels, we want a lossy method
// described below. But, if we have alpha, we should do something
// different (and probably using RLE). If we have depth, or velocity,
// or something else, just fall back to ZIP. The rules for deciding
// which strategy to use are setup in initializeDefaultChannelRules().
// When writing a file, the relevant rules needed to decode are written
// into the start of the data block, making a self-contained file.
// If initializeDefaultChannelRules() doesn't quite suite your naming
// conventions, you can adjust the rules without breaking decoder
// compatability.
//
// If we're going to lossy compress R, G, or B channels, it's easier
// to toss bits in a more perceptual uniform space. One could argue
// at length as to what constitutes perceptually uniform, expecially
// when storing either scene/input/focal plane referred and output referred
// data.
//
// We'll compromise. For values <= 1, we use a traditional power function
// (without any of that straight-line business at the bottom). For values > 1,
// we want something more like a log function, since power functions blow
// up. At 1, we want a smooth blend between the functions. So, we use a
// piecewise function that does just that - see dwaLookups.cpp for
// a little more detail.
//
// Also, if we find that we have R, G, and B channels from the same layer,
// we can get a bit more compression efficiency by transforming to a Y'CbCr
// space. We use the 709 transform, but with Cb,Cr = 0 for an input of
// (0, 0, 0), instead of the traditional Cb,Cr = .5. Shifting the zero point
// makes no sense with large range data. Transforms are done to from
// the perceptual space data, not the linear-light space data (R'G'B' ->
// (Y'CbCr, not RGB -> YCbCr).
//
// Next, we forward DCT the data. This is done with a floating
// point DCT, as we don't really have control over the src range. The
// resulting values are dropped to half-float precision.
//
// Now, we need to quantize. Quantization departs from the usual way
// of dividing and rounding. Instead, we start with some floating
// point "base-error" value. From this, we can derive quantization
// error for each DCT component. Take the standard JPEG quantization
// tables and normalize them by the smallest value. Then, multiply
// the normalized quant tables by our base-error value. This gives
// a range of errors for each DCT component.
//
// For each DCT component, we want to find a quantized value that
// is within +- the per-component error. Pick the quantized value
// that has the fewest bits set in its' binary representation.
// Brute-forcing the search would make for extremly inefficient
// compression. Fortunatly, we can precompute a table to assist
// with this search.
//
// For each 16-bit float value, there are at most 15 other values with
// fewer bits set. We can precompute these values in a compact form, since
// many source values have far fewer that 15 possible quantized values.
// Now, instead of searching the entire range +- the component error,
// we can just search at most 15 quantization candidates. The search can
// be accelerated a bit more by sorting the candidates by the
// number of bits set, in increasing order. Then, the search can stop
// once a candidate is found w/i the per-component quantization
// error range.
//
// The quantization strategy has the side-benefit that there is no
// de-quantization step upon decode, so we don't bother recording
// the quantization table.
//
// Ok. So we now have quantized values. Time for entropy coding. We
// can use either static Huffman or zlib/DEFLATE. The static Huffman
// is more efficient at compacting data, but can have a greater
// overhead, especially for smaller tile/strip sizes.
//
// There is some additional fun, like ZIP compressing the DC components
// instead of Huffman/zlib, which helps make things slightly smaller.
//
// Compression level is controlled by setting an int/float/double attribute
// on the header named "dwaCompressionLevel". This is a thinly veiled name for
// the "base-error" value mentioned above. The "base-error" is just
// dwaCompressionLevel / 100000. The default value of 45.0 is generally
// pretty good at generating "visually lossless" values at reasonable
// data rates. Setting dwaCompressionLevel to 0 should result in no additional
// quantization at the quantization stage (though there may be
// quantization in practice at the CSC/DCT steps). But if you really
// want lossless compression, there are pleanty of other choices
// of compressors ;)
//
// When dealing with FLOAT source buffers, we first quantize the source
// to HALF and continue down as we would for HALF source.
//
//---------------------------------------------------
#include "ImfDwaCompressor.h"
#include "ImfDwaCompressorSimd.h"
#include "ImfChannelList.h"
#include "ImfStandardAttributes.h"
#include "ImfHeader.h"
#include "ImfHuf.h"
#include "ImfInt64.h"
#include "ImfIntAttribute.h"
#include "ImfIO.h"
#include "ImfMisc.h"
#include "ImfNamespace.h"
#include "ImfRle.h"
#include "ImfSimd.h"
#include "ImfSystemSpecific.h"
#include "ImfXdr.h"
#include "ImfZip.h"
#include "ImathFun.h"
#include "ImathBox.h"
#include "ImathVec.h"
#include "half.h"
#include "halfLimits.h"
#include "dwaLookups.h"
#include <vector>
#include <string>
#include <cctype>
#include <cassert>
#include <algorithm>
// Windows specific addition to prevent the indirect import of the redefined min/max macros
#if defined _WIN32 || defined _WIN64
#ifdef NOMINMAX
#undef NOMINMAX
#endif
#define NOMINMAX
#endif
#include <zlib.h>
OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_ENTER
namespace {
//
// Function pointer to dispatch to an approprate
// convertFloatToHalf64_* impl, based on runtime cpu checking.
// Should be initialized in DwaCompressor::initializeFuncs()
//
void (*convertFloatToHalf64)(unsigned short*, float*) =
convertFloatToHalf64_scalar;
//
// Function pointer for dispatching a fromHalfZigZag_ impl
//
void (*fromHalfZigZag)(unsigned short*, float*) =
fromHalfZigZag_scalar;
//
// Dispatch the inverse DCT on an 8x8 block, where the last
// n rows can be all zeros. The n=0 case converts the full block.
//
void (*dctInverse8x8_0)(float*) = dctInverse8x8_scalar<0>;
void (*dctInverse8x8_1)(float*) = dctInverse8x8_scalar<1>;
void (*dctInverse8x8_2)(float*) = dctInverse8x8_scalar<2>;
void (*dctInverse8x8_3)(float*) = dctInverse8x8_scalar<3>;
void (*dctInverse8x8_4)(float*) = dctInverse8x8_scalar<4>;
void (*dctInverse8x8_5)(float*) = dctInverse8x8_scalar<5>;
void (*dctInverse8x8_6)(float*) = dctInverse8x8_scalar<6>;
void (*dctInverse8x8_7)(float*) = dctInverse8x8_scalar<7>;
} // namespace
struct DwaCompressor::ChannelData
{
std::string name;
CompressorScheme compression;
int xSampling;
int ySampling;
PixelType type;
bool pLinear;
int width;
int height;
//
// Incoming and outgoing data is scanline interleaved, and it's much
// easier to operate on contiguous data. Assuming the planare unc
// buffer is to hold RLE data, we need to rearrange to make bytes
// adjacent.
//
char *planarUncBuffer;
char *planarUncBufferEnd;
char *planarUncRle[4];
char *planarUncRleEnd[4];
PixelType planarUncType;
int planarUncSize;
};
struct DwaCompressor::CscChannelSet
{
int idx[3];
};
struct DwaCompressor::Classifier
{
Classifier (std::string suffix,
CompressorScheme scheme,
PixelType type,
int cscIdx,
bool caseInsensitive):
_suffix(suffix),
_scheme(scheme),
_type(type),
_cscIdx(cscIdx),
_caseInsensitive(caseInsensitive)
{
if (caseInsensitive)
transform(_suffix.begin(), _suffix.end(), _suffix.begin(), tolower);
}
Classifier (const char *&ptr, int size)
{
if (size <= 0)
throw Iex::InputExc("Error uncompressing DWA data"
" (truncated rule).");
{
char suffix[Name::SIZE];
memset (suffix, 0, Name::SIZE);
Xdr::read<CharPtrIO> (ptr, std::min(size, Name::SIZE-1), suffix);
_suffix = std::string(suffix);
}
if (size < _suffix.length() + 1 + 2*Xdr::size<char>())
throw Iex::InputExc("Error uncompressing DWA data"
" (truncated rule).");
char value;
Xdr::read<CharPtrIO> (ptr, value);
_cscIdx = (int)(value >> 4) - 1;
if (_cscIdx < -1 || _cscIdx >= 3)
throw Iex::InputExc("Error uncompressing DWA data"
" (corrupt cscIdx rule).");
_scheme = (CompressorScheme)((value >> 2) & 3);
if (_scheme < 0 || _scheme >= NUM_COMPRESSOR_SCHEMES)
throw Iex::InputExc("Error uncompressing DWA data"
" (corrupt scheme rule).");
_caseInsensitive = (value & 1 ? true : false);
Xdr::read<CharPtrIO> (ptr, value);
if (value < 0 || value >= NUM_PIXELTYPES)
throw Iex::InputExc("Error uncompressing DWA data"
" (corrupt rule).");
_type = (PixelType)value;
}
bool match (const std::string &suffix, const PixelType type) const
{
if (_type != type) return false;
if (_caseInsensitive)
{
std::string tmp(suffix);
transform(tmp.begin(), tmp.end(), tmp.begin(), tolower);
return tmp == _suffix;
}
return suffix == _suffix;
}
size_t size () const
{
// string length + \0
size_t sizeBytes = _suffix.length() + 1;
// 1 byte for scheme / cscIdx / caseInsensitive, and 1 byte for type
sizeBytes += 2 * Xdr::size<char>();
return sizeBytes;
}
void write (char *&ptr) const
{
Xdr::write<CharPtrIO> (ptr, _suffix.c_str());
// Encode _cscIdx (-1-3) in the upper 4 bits,
// _scheme (0-2) in the next 2 bits
// _caseInsen in the bottom bit
unsigned char value = 0;
value |= ((unsigned char)(_cscIdx+1) & 15) << 4;
value |= ((unsigned char)_scheme & 3) << 2;
value |= (unsigned char)_caseInsensitive & 1;
Xdr::write<CharPtrIO> (ptr, value);
Xdr::write<CharPtrIO> (ptr, (unsigned char)_type);
}
std::string _suffix;
CompressorScheme _scheme;
PixelType _type;
int _cscIdx;
bool _caseInsensitive;
};
//
// Base class for the LOSSY_DCT decoder classes
//
class DwaCompressor::LossyDctDecoderBase
{
public:
LossyDctDecoderBase
(char *packedAc,
char *packedDc,
const unsigned short *toLinear,
int width,
int height);
virtual ~LossyDctDecoderBase ();
void execute();
//
// These return number of items, not bytes. Each item
// is an unsigned short
//
int numAcValuesEncoded() const { return _packedAcCount; }
int numDcValuesEncoded() const { return _packedDcCount; }
protected:
//
// Un-RLE the packed AC components into
// a half buffer. The half block should
// be the full 8x8 block (in zig-zag order
// still), not the first AC component.
//
// currAcComp is advanced as bytes are decoded.
//
// This returns the index of the last non-zero
// value in the buffer - with the index into zig zag
// order data. If we return 0, we have DC only data.
//
int unRleAc (unsigned short *&currAcComp,
unsigned short *halfZigBlock);
//
// if NATIVE and XDR are really the same values, we can
// skip some processing and speed things along
//
bool _isNativeXdr;
//
// Counts of how many items have been packed into the
// AC and DC buffers
//
int _packedAcCount;
int _packedDcCount;
//
// AC and DC buffers to pack
//
char *_packedAc;
char *_packedDc;
//
// half -> half LUT to transform from nonlinear to linear
//
const unsigned short *_toLinear;
//
// image dimensions
//
int _width;
int _height;
//
// Pointers to the start of each scanlines, to be filled on decode
// Generally, these will be filled by the subclasses.
//
std::vector< std::vector<char *> > _rowPtrs;
//
// The type of each data that _rowPtrs[i] is referring. Layout
// is in the same order as _rowPtrs[].
//
std::vector<PixelType> _type;
std::vector<SimdAlignedBuffer64f> _dctData;
};
//
// Used to decode a single channel of LOSSY_DCT data.
//
class DwaCompressor::LossyDctDecoder: public LossyDctDecoderBase
{
public:
//
// toLinear is a half-float LUT to convert the encoded values
// back to linear light. If you want to skip this step, pass
// in NULL here.
//
LossyDctDecoder
(std::vector<char *> &rowPtrs,
char *packedAc,
char *packedDc,
const unsigned short *toLinear,
int width,
int height,
PixelType type)
:
LossyDctDecoderBase(packedAc, packedDc, toLinear, width, height)
{
_rowPtrs.push_back(rowPtrs);
_type.push_back(type);
}
virtual ~LossyDctDecoder () {}
};
//
// Used to decode 3 channels of LOSSY_DCT data that
// are grouped together and color space converted.
//
class DwaCompressor::LossyDctDecoderCsc: public LossyDctDecoderBase
{
public:
//
// toLinear is a half-float LUT to convert the encoded values
// back to linear light. If you want to skip this step, pass
// in NULL here.
//
LossyDctDecoderCsc
(std::vector<char *> &rowPtrsR,
std::vector<char *> &rowPtrsG,
std::vector<char *> &rowPtrsB,
char *packedAc,
char *packedDc,
const unsigned short *toLinear,
int width,
int height,
PixelType typeR,
PixelType typeG,
PixelType typeB)
:
LossyDctDecoderBase(packedAc, packedDc, toLinear, width, height)
{
_rowPtrs.push_back(rowPtrsR);
_rowPtrs.push_back(rowPtrsG);
_rowPtrs.push_back(rowPtrsB);
_type.push_back(typeR);
_type.push_back(typeG);
_type.push_back(typeB);
}
virtual ~LossyDctDecoderCsc () {}
};
//
// Base class for encoding using the lossy DCT scheme
//
class DwaCompressor::LossyDctEncoderBase
{
public:
LossyDctEncoderBase
(float quantBaseError,
char *packedAc,
char *packedDc,
const unsigned short *toNonlinear,
int width,
int height);
virtual ~LossyDctEncoderBase ();
void execute ();
//
// These return number of items, not bytes. Each item
// is an unsigned short
//
int numAcValuesEncoded () const {return _numAcComp;}
int numDcValuesEncoded () const {return _numDcComp;}
protected:
void toZigZag (half *dst, half *src);
int countSetBits (unsigned short src);
half quantize (half src, float errorTolerance);
void rleAc (half *block, unsigned short *&acPtr);
float _quantBaseError;
int _width,
_height;
const unsigned short *_toNonlinear;
int _numAcComp,
_numDcComp;
std::vector< std::vector<const char *> > _rowPtrs;
std::vector<PixelType> _type;
std::vector<SimdAlignedBuffer64f> _dctData;
//
// Pointers to the buffers where AC and DC
// DCT components should be packed for
// lossless compression downstream
//
char *_packedAc;
char *_packedDc;
//
// Our "quantization tables" - the example JPEG tables,
// normalized so that the smallest value in each is 1.0.
// This gives us a relationship between error in DCT
// components
//
float _quantTableY[64];
float _quantTableCbCr[64];
};
//
// Single channel lossy DCT encoder
//
class DwaCompressor::LossyDctEncoder: public LossyDctEncoderBase
{
public:
LossyDctEncoder
(float quantBaseError,
std::vector<const char *> &rowPtrs,
char *packedAc,
char *packedDc,
const unsigned short *toNonlinear,
int width,
int height,
PixelType type)
:
LossyDctEncoderBase
(quantBaseError, packedAc, packedDc, toNonlinear, width, height)
{
_rowPtrs.push_back(rowPtrs);
_type.push_back(type);
}
virtual ~LossyDctEncoder () {}
};
//
// RGB channel lossy DCT encoder
//
class DwaCompressor::LossyDctEncoderCsc: public LossyDctEncoderBase
{
public:
LossyDctEncoderCsc
(float quantBaseError,
std::vector<const char *> &rowPtrsR,
std::vector<const char *> &rowPtrsG,
std::vector<const char *> &rowPtrsB,
char *packedAc,
char *packedDc,
const unsigned short *toNonlinear,
int width,
int height,
PixelType typeR,
PixelType typeG,
PixelType typeB)
:
LossyDctEncoderBase
(quantBaseError, packedAc, packedDc, toNonlinear, width, height)
{
_type.push_back(typeR);
_type.push_back(typeG);
_type.push_back(typeB);
_rowPtrs.push_back(rowPtrsR);
_rowPtrs.push_back(rowPtrsG);
_rowPtrs.push_back(rowPtrsB);
}
virtual ~LossyDctEncoderCsc () {}
};
// ==============================================================
//
// LossyDctDecoderBase
//
// --------------------------------------------------------------
DwaCompressor::LossyDctDecoderBase::LossyDctDecoderBase
(char *packedAc,
char *packedDc,
const unsigned short *toLinear,
int width,
int height)
:
_isNativeXdr(false),
_packedAcCount(0),
_packedDcCount(0),
_packedAc(packedAc),
_packedDc(packedDc),
_toLinear(toLinear),
_width(width),
_height(height)
{
if (_toLinear == 0)
_toLinear = dwaCompressorNoOp;
_isNativeXdr = GLOBAL_SYSTEM_LITTLE_ENDIAN;
}
DwaCompressor::LossyDctDecoderBase::~LossyDctDecoderBase () {}
void
DwaCompressor::LossyDctDecoderBase::execute ()
{
int numComp = _rowPtrs.size();
int lastNonZero = 0;
int numBlocksX = (int) ceil ((float)_width / 8.0f);
int numBlocksY = (int) ceil ((float)_height / 8.0f);
int leftoverX = _width - (numBlocksX-1) * 8;
int leftoverY = _height - (numBlocksY-1) * 8;
int numFullBlocksX = (int)floor ((float)_width / 8.0f);
unsigned short tmpShortNative = 0;
unsigned short tmpShortXdr = 0;
const char *tmpConstCharPtr = 0;
unsigned short *currAcComp = (unsigned short *)_packedAc;
std::vector<unsigned short *> currDcComp (_rowPtrs.size());
std::vector<SimdAlignedBuffer64us> halfZigBlock (_rowPtrs.size());
if (_type.size() != _rowPtrs.size())
throw Iex::BaseExc ("Row pointers and types mismatch in count");
if ((_rowPtrs.size() != 3) && (_rowPtrs.size() != 1))
throw Iex::NoImplExc ("Only 1 and 3 channel encoding is supported");
_dctData.resize(numComp);
//
// Allocate a temp aligned buffer to hold a rows worth of full
// 8x8 half-float blocks
//
unsigned char *rowBlockHandle = new unsigned char
[numComp * numBlocksX * 64 * sizeof(unsigned short) + _SSE_ALIGNMENT];
unsigned short *rowBlock[3];
rowBlock[0] = (unsigned short*)rowBlockHandle;
for (int i = 0; i < _SSE_ALIGNMENT; ++i)
{
if (((size_t)(rowBlockHandle + i) & _SSE_ALIGNMENT_MASK) == 0)
rowBlock[0] = (unsigned short *)(rowBlockHandle + i);
}
for (int comp = 1; comp < numComp; ++comp)
rowBlock[comp] = rowBlock[comp - 1] + numBlocksX * 64;
//
// Pack DC components together by common plane, so we can get
// a little more out of differencing them. We'll always have
// one component per block, so we can computed offsets.
//
currDcComp[0] = (unsigned short *)_packedDc;
for (unsigned int comp = 1; comp < numComp; ++comp)
currDcComp[comp] = currDcComp[comp - 1] + numBlocksX * numBlocksY;
for (int blocky = 0; blocky < numBlocksY; ++blocky)
{
int maxY = 8;
if (blocky == numBlocksY-1)
maxY = leftoverY;
int maxX = 8;
for (int blockx = 0; blockx < numBlocksX; ++blockx)
{
if (blockx == numBlocksX-1)
maxX = leftoverX;
//
// If we can detect that the block is constant values
// (all components only have DC values, and all AC is 0),
// we can do everything only on 1 value, instead of all
// 64.
//
// This won't really help for regular images, but it is
// meant more for layers with large swaths of black
//
bool blockIsConstant = true;
for (unsigned int comp = 0; comp < numComp; ++comp)
{
//
// DC component is stored separately
//
#ifdef IMF_HAVE_SSE2
{
__m128i *dst = (__m128i*)halfZigBlock[comp]._buffer;
dst[7] = _mm_setzero_si128();
dst[6] = _mm_setzero_si128();
dst[5] = _mm_setzero_si128();
dst[4] = _mm_setzero_si128();
dst[3] = _mm_setzero_si128();
dst[2] = _mm_setzero_si128();
dst[1] = _mm_setzero_si128();
dst[0] = _mm_insert_epi16
(_mm_setzero_si128(), *currDcComp[comp]++, 0);
}
#else /* IMF_HAVE_SSE2 */
memset (halfZigBlock[comp]._buffer, 0, 64 * 2);
halfZigBlock[comp]._buffer[0] = *currDcComp[comp]++;
#endif /* IMF_HAVE_SSE2 */
_packedDcCount++;
//
// UnRLE the AC. This will modify currAcComp
//
lastNonZero = unRleAc (currAcComp, halfZigBlock[comp]._buffer);
//
// Convert from XDR to NATIVE
//
if (!_isNativeXdr)
{
for (int i = 0; i < 64; ++i)
{
tmpShortXdr = halfZigBlock[comp]._buffer[i];
tmpConstCharPtr = (const char *)&tmpShortXdr;
Xdr::read<CharPtrIO> (tmpConstCharPtr, tmpShortNative);
halfZigBlock[comp]._buffer[i] = tmpShortNative;
}
}
if (lastNonZero == 0)
{
//
// DC only case - AC components are all 0
//
half h;
h.setBits (halfZigBlock[comp]._buffer[0]);
_dctData[comp]._buffer[0] = (float)h;
dctInverse8x8DcOnly (_dctData[comp]._buffer);
}
else
{
//
// We have some AC components that are non-zero.
// Can't use the 'constant block' optimization
//
blockIsConstant = false;
//
// Un-Zig zag
//
(*fromHalfZigZag)
(halfZigBlock[comp]._buffer, _dctData[comp]._buffer);
//
// Zig-Zag indices in normal layout are as follows:
//
// 0 1 5 6 14 15 27 28
// 2 4 7 13 16 26 29 42
// 3 8 12 17 25 30 41 43
// 9 11 18 24 31 40 44 53
// 10 19 23 32 39 45 52 54
// 20 22 33 38 46 51 55 60
// 21 34 37 47 50 56 59 61
// 35 36 48 49 57 58 62 63
//
// If lastNonZero is less than the first item on
// each row, we know that the whole row is zero and
// can be skipped in the row-oriented part of the
// iDCT.
//
// The unrolled logic here is:
//
// if lastNonZero < rowStartIdx[i],
// zeroedRows = rowsEmpty[i]
//
// where:
//
// const int rowStartIdx[] = {2, 3, 9, 10, 20, 21, 35};
// const int rowsEmpty[] = {7, 6, 5, 4, 3, 2, 1};
//
if (lastNonZero < 2)
dctInverse8x8_7(_dctData[comp]._buffer);
else if (lastNonZero < 3)
dctInverse8x8_6(_dctData[comp]._buffer);
else if (lastNonZero < 9)
dctInverse8x8_5(_dctData[comp]._buffer);
else if (lastNonZero < 10)
dctInverse8x8_4(_dctData[comp]._buffer);
else if (lastNonZero < 20)
dctInverse8x8_3(_dctData[comp]._buffer);
else if (lastNonZero < 21)
dctInverse8x8_2(_dctData[comp]._buffer);
else if (lastNonZero < 35)
dctInverse8x8_1(_dctData[comp]._buffer);
else
dctInverse8x8_0(_dctData[comp]._buffer);
}
}
//
// Perform the CSC
//
if (numComp == 3)
{
if (!blockIsConstant)
{
csc709Inverse64 (_dctData[0]._buffer,
_dctData[1]._buffer,
_dctData[2]._buffer);
}
else
{
csc709Inverse (_dctData[0]._buffer[0],
_dctData[1]._buffer[0],
_dctData[2]._buffer[0]);
}
}
//
// Float -> Half conversion.
//
// If the block has a constant value, just convert the first pixel.
//
for (unsigned int comp = 0; comp < numComp; ++comp)
{
if (!blockIsConstant)
{
(*convertFloatToHalf64)
(&rowBlock[comp][blockx*64], _dctData[comp]._buffer);
}
else
{
#if IMF_HAVE_SSE2
__m128i *dst = (__m128i*)&rowBlock[comp][blockx*64];
dst[0] = _mm_set1_epi16
(((half)_dctData[comp]._buffer[0]).bits());
dst[1] = dst[0];
dst[2] = dst[0];
dst[3] = dst[0];
dst[4] = dst[0];
dst[5] = dst[0];
dst[6] = dst[0];
dst[7] = dst[0];
#else /* IMF_HAVE_SSE2 */
unsigned short *dst = &rowBlock[comp][blockx*64];
dst[0] = ((half)_dctData[comp]._buffer[0]).bits();
for (int i = 1; i < 64; ++i)
{
dst[i] = dst[0];
}
#endif /* IMF_HAVE_SSE2 */
} // blockIsConstant
} // comp
} // blockx
//
// At this point, we have half-float nonlinear value blocked
// in rowBlock[][]. We need to unblock the data, transfer
// back to linear, and write the results in the _rowPtrs[].
//
// There is a fast-path for aligned rows, which helps
// things a little. Since this fast path is only valid
// for full 8-element wide blocks, the partial x blocks
// are broken into a separate loop below.
//
// At the moment, the fast path requires:
// * sse support
// * aligned row pointers
// * full 8-element wide blocks
//
for (int comp = 0; comp < numComp; ++comp)
{
//
// Test if we can use the fast path
//
#ifdef IMF_HAVE_SSE2
bool fastPath = true;
for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
{
if ((size_t)_rowPtrs[comp][y] & _SSE_ALIGNMENT_MASK)
fastPath = false;
}
if (fastPath)
{
//
// Handle all the full X blocks, in a fast path with sse2 and
// aligned row pointers
//
for (int y=8*blocky; y<8*blocky+maxY; ++y)
{
__m128i *dst = (__m128i *)_rowPtrs[comp][y];
__m128i *src = (__m128i *)&rowBlock[comp][(y & 0x7) * 8];
for (int blockx = 0; blockx < numFullBlocksX; ++blockx)
{
//
// These may need some twiddling.
// Run with multiples of 8
//
_mm_prefetch ((char *)(src + 16), _MM_HINT_NTA);
unsigned short i0 = _mm_extract_epi16 (*src, 0);
unsigned short i1 = _mm_extract_epi16 (*src, 1);
unsigned short i2 = _mm_extract_epi16 (*src, 2);
unsigned short i3 = _mm_extract_epi16 (*src, 3);
unsigned short i4 = _mm_extract_epi16 (*src, 4);
unsigned short i5 = _mm_extract_epi16 (*src, 5);
unsigned short i6 = _mm_extract_epi16 (*src, 6);
unsigned short i7 = _mm_extract_epi16 (*src, 7);
i0 = _toLinear[i0];
i1 = _toLinear[i1];
i2 = _toLinear[i2];
i3 = _toLinear[i3];
i4 = _toLinear[i4];
i5 = _toLinear[i5];
i6 = _toLinear[i6];
i7 = _toLinear[i7];
*dst = _mm_insert_epi16 (_mm_setzero_si128(), i0, 0);
*dst = _mm_insert_epi16 (*dst, i1, 1);
*dst = _mm_insert_epi16 (*dst, i2, 2);
*dst = _mm_insert_epi16 (*dst, i3, 3);
*dst = _mm_insert_epi16 (*dst, i4, 4);
*dst = _mm_insert_epi16 (*dst, i5, 5);
*dst = _mm_insert_epi16 (*dst, i6, 6);
*dst = _mm_insert_epi16 (*dst, i7, 7);
src += 8;
dst++;
}
}
}
else
{
#endif /* IMF_HAVE_SSE2 */
//
// Basic scalar kinda slow path for handling the full X blocks
//
for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
{
unsigned short *dst = (unsigned short *)_rowPtrs[comp][y];
for (int blockx = 0; blockx < numFullBlocksX; ++blockx)
{
unsigned short *src =
&rowBlock[comp][blockx * 64 + ((y & 0x7) * 8)];
dst[0] = _toLinear[src[0]];
dst[1] = _toLinear[src[1]];
dst[2] = _toLinear[src[2]];
dst[3] = _toLinear[src[3]];
dst[4] = _toLinear[src[4]];
dst[5] = _toLinear[src[5]];
dst[6] = _toLinear[src[6]];
dst[7] = _toLinear[src[7]];
dst += 8;
}
}
#ifdef IMF_HAVE_SSE2
}
#endif /* IMF_HAVE_SSE2 */
//
// If we have partial X blocks, deal with all those now
// Since this should be minimal work, there currently
// is only one path that should work for everyone.
//
if (numFullBlocksX != numBlocksX)
{
for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
{
unsigned short *src = (unsigned short *)
&rowBlock[comp][numFullBlocksX * 64 + ((y & 0x7) * 8)];
unsigned short *dst = (unsigned short *)_rowPtrs[comp][y];
dst += 8 * numFullBlocksX;
for (int x = 0; x < maxX; ++x)
{
*dst++ = _toLinear[*src++];
}
}
}
} // comp
} // blocky
//
// Walk over all the channels that are of type FLOAT.
// Convert from HALF XDR back to FLOAT XDR.
//
for (unsigned int chan = 0; chan < numComp; ++chan)
{
if (_type[chan] != FLOAT)
continue;
std::vector<unsigned short> halfXdr (_width);
for (int y=0; y<_height; ++y)
{
char *floatXdrPtr = _rowPtrs[chan][y];
memcpy(&halfXdr[0], floatXdrPtr, _width*sizeof(unsigned short));
const char *halfXdrPtr = (const char *)(&halfXdr[0]);
for (int x=0; x<_width; ++x)
{
half tmpHalf;
Xdr::read<CharPtrIO> (halfXdrPtr, tmpHalf);
Xdr::write<CharPtrIO> (floatXdrPtr, (float)tmpHalf);
//
// Xdr::write and Xdr::read will advance the ptrs
//
}
}
}
delete[] rowBlockHandle;
}
//
// Un-RLE the packed AC components into
// a half buffer. The half block should
// be the full 8x8 block (in zig-zag order
// still), not the first AC component.
//
// currAcComp is advanced as bytes are decoded.
//
// This returns the index of the last non-zero
// value in the buffer - with the index into zig zag
// order data. If we return 0, we have DC only data.
//
// This is assuminging that halfZigBlock is zero'ed
// prior to calling
//
int
DwaCompressor::LossyDctDecoderBase::unRleAc
(unsigned short *&currAcComp,
unsigned short *halfZigBlock)
{
//
// Un-RLE the RLE'd blocks. If we find an item whose
// high byte is 0xff, then insert the number of 0's
// as indicated by the low byte.
//
// Otherwise, just copy the number verbaitm.
//
int lastNonZero = 0;
int dctComp = 1;
//
// Start with a zero'ed block, so we don't have to
// write when we hit a run symbol
//
while (dctComp < 64)
{
if (*currAcComp == 0xff00)
{
//
// End of block
//
dctComp = 64;
}
else if ((*currAcComp) >> 8 == 0xff)
{
//
// Run detected! Insert 0's.
//
// Since the block has been zeroed, just advance the ptr
//
dctComp += (*currAcComp) & 0xff;
}
else
{
//
// Not a run, just copy over the value
//
lastNonZero = dctComp;
halfZigBlock[dctComp] = *currAcComp;
dctComp++;
}
_packedAcCount++;
currAcComp++;
}
return lastNonZero;
}
// ==============================================================
//
// LossyDctEncoderBase
//
// --------------------------------------------------------------
DwaCompressor::LossyDctEncoderBase::LossyDctEncoderBase
(float quantBaseError,
char *packedAc,
char *packedDc,
const unsigned short *toNonlinear,
int width,
int height)
:
_quantBaseError(quantBaseError),
_width(width),
_height(height),
_toNonlinear(toNonlinear),
_numAcComp(0),
_numDcComp(0),
_packedAc(packedAc),
_packedDc(packedDc)
{
//
// Here, we take the generic JPEG quantization tables and
// normalize them by the smallest component in each table.
// This gives us a relationship amongst the DCT components,
// in terms of how sensitive each component is to
// error.
//
// A higher normalized value means we can quantize more,
// and a small normalized value means we can quantize less.
//
// Eventually, we will want an acceptable quantization
// error range for each component. We find this by
// multiplying some user-specified level (_quantBaseError)
// by the normalized table (_quantTableY, _quantTableCbCr) to
// find the acceptable quantization error range.
//
// The quantization table is not needed for decoding, and
// is not transmitted. So, if you want to get really fancy,
// you could derive some content-dependent quantization
// table, and the decoder would not need to be changed. But,
// for now, we'll just use statice quantization tables.
//
int jpegQuantTableY[] =
{
16, 11, 10, 16, 24, 40, 51, 61,
12, 12, 14, 19, 26, 58, 60, 55,
14, 13, 16, 24, 40, 57, 69, 56,
14, 17, 22, 29, 51, 87, 80, 62,
18, 22, 37, 56, 68, 109, 103, 77,
24, 35, 55, 64, 81, 104, 113, 92,
49, 64, 78, 87, 103, 121, 120, 101,
72, 92, 95, 98, 112, 100, 103, 99
};
int jpegQuantTableYMin = 10;
int jpegQuantTableCbCr[] =
{
17, 18, 24, 47, 99, 99, 99, 99,
18, 21, 26, 66, 99, 99, 99, 99,
24, 26, 56, 99, 99, 99, 99, 99,
47, 66, 99, 99, 99, 99, 99, 99,
99, 99, 99, 99, 99, 99, 99, 99,
99, 99, 99, 99, 99, 99, 99, 99,
99, 99, 99, 99, 99, 99, 99, 99,
99, 99, 99, 99, 99, 99, 99, 99
};
int jpegQuantTableCbCrMin = 17;
for (int idx = 0; idx < 64; ++idx)
{
_quantTableY[idx] = static_cast<float> (jpegQuantTableY[idx]) /
static_cast<float> (jpegQuantTableYMin);
_quantTableCbCr[idx] = static_cast<float> (jpegQuantTableCbCr[idx]) /
static_cast<float> (jpegQuantTableCbCrMin);
}
if (_quantBaseError < 0)
quantBaseError = 0;
}
DwaCompressor::LossyDctEncoderBase::~LossyDctEncoderBase ()
{
}
//
// Given three channels of source data, encoding by first applying
// a color space conversion to a YCbCr space. Otherwise, if we only
// have one channel, just encode it as is.
//
// Other numbers of channels are somewhat unexpected at this point,
// and will throw an exception.
//
void
DwaCompressor::LossyDctEncoderBase::execute ()
{
int numBlocksX = (int)ceil ((float)_width / 8.0f);
int numBlocksY = (int)ceil ((float)_height/ 8.0f);
half halfZigCoef[64];
half halfCoef[64];
std::vector<unsigned short *> currDcComp (_rowPtrs.size());
unsigned short *currAcComp = (unsigned short *)_packedAc;
_dctData.resize (_rowPtrs.size());
_numAcComp = 0;
_numDcComp = 0;
assert (_type.size() == _rowPtrs.size());
assert ((_rowPtrs.size() == 3) || (_rowPtrs.size() == 1));
//
// Allocate a temp half buffer to quantize into for
// any FLOAT source channels.
//
int tmpHalfBufferElements = 0;
for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
if (_type[chan] == FLOAT)
tmpHalfBufferElements += _width * _height;
std::vector<unsigned short> tmpHalfBuffer (tmpHalfBufferElements);
char *tmpHalfBufferPtr = 0;
if (tmpHalfBufferElements)
tmpHalfBufferPtr = (char *)&tmpHalfBuffer[0];
//
// Run over all the float scanlines, quantizing,
// and re-assigning _rowPtr[y]. We need to translate
// FLOAT XDR to HALF XDR.
//
for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
{
if (_type[chan] != FLOAT)
continue;
for (int y = 0; y < _height; ++y)
{
float src = 0;
const char *srcXdr = _rowPtrs[chan][y];
char *dstXdr = tmpHalfBufferPtr;
for (int x = 0; x < _width; ++x)
{
Xdr::read<CharPtrIO> (srcXdr, src);
//
// Clamp to half ranges, instead of just casting. This
// avoids introducing Infs which end up getting zeroed later
//
src = std::max (
std::min ((float) std::numeric_limits<half>::max(), src),
(float)-std::numeric_limits<half>::max());
Xdr::write<CharPtrIO> (dstXdr, ((half)src).bits());
//
// Xdr::read and Xdr::write will advance the ptr
//
}
_rowPtrs[chan][y] = (const char *)tmpHalfBufferPtr;
tmpHalfBufferPtr += _width * sizeof (unsigned short);
}
}
//
// Pack DC components together by common plane, so we can get
// a little more out of differencing them. We'll always have
// one component per block, so we can computed offsets.
//
currDcComp[0] = (unsigned short *)_packedDc;
for (unsigned int chan = 1; chan < _rowPtrs.size(); ++chan)
currDcComp[chan] = currDcComp[chan-1] + numBlocksX * numBlocksY;
for (int blocky = 0; blocky < numBlocksY; ++blocky)
{
for (int blockx = 0; blockx < numBlocksX; ++blockx)
{
half h;
unsigned short tmpShortXdr, tmpShortNative;
char *tmpCharPtr;
for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
{
//
// Break the source into 8x8 blocks. If we don't
// fit at the edges, mirror.
//
// Also, convert from linear to nonlinear representation.
// Our source is assumed to be XDR, and we need to convert
// to NATIVE prior to converting to float.
//
// If we're converting linear -> nonlinear, assume that the
// XDR -> NATIVE conversion is built into the lookup. Otherwise,
// we'll need to explicitly do it.
//
for (int y = 0; y < 8; ++y)
{
for (int x = 0; x < 8; ++x)
{
int vx = 8 * blockx + x;
int vy = 8 * blocky + y;
if (vx >= _width)
vx = _width - (vx - (_width - 1));
if (vx < 0) vx = _width-1;
if (vy >=_height)
vy = _height - (vy - (_height - 1));
if (vy < 0) vy = _height-1;
tmpShortXdr =
((const unsigned short *)(_rowPtrs[chan])[vy])[vx];
if (_toNonlinear)
{
h.setBits (_toNonlinear[tmpShortXdr]);
}
else
{
const char *tmpConstCharPtr =
(const char *)(&tmpShortXdr);
Xdr::read<CharPtrIO>
(tmpConstCharPtr, tmpShortNative);
h.setBits(tmpShortNative);
}
_dctData[chan]._buffer[y * 8 + x] = (float)h;
} // x
} // y
} // chan
//
// Color space conversion
//
if (_rowPtrs.size() == 3)
{
csc709Forward64 (_dctData[0]._buffer,
_dctData[1]._buffer,
_dctData[2]._buffer);
}
for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
{
//
// Forward DCT
//
dctForward8x8(_dctData[chan]._buffer);
//
// Quantize to half, and zigzag
//
if (chan == 0)
{
for (int i = 0; i < 64; ++i)
{
halfCoef[i] =
quantize ((half)_dctData[chan]._buffer[i],
_quantBaseError*_quantTableY[i]);
}
}
else
{
for (int i = 0; i < 64; ++i)
{
halfCoef[i] =
quantize ((half)_dctData[chan]._buffer[i],
_quantBaseError*_quantTableCbCr[i]);
}
}
toZigZag (halfZigCoef, halfCoef);
//
// Convert from NATIVE back to XDR, before we write out
//
for (int i = 0; i < 64; ++i)
{
tmpCharPtr = (char *)&tmpShortXdr;
Xdr::write<CharPtrIO>(tmpCharPtr, halfZigCoef[i].bits());
halfZigCoef[i].setBits(tmpShortXdr);
}
//
// Save the DC component separately, to be compressed on
// its own.
//
*currDcComp[chan]++ = halfZigCoef[0].bits();
_numDcComp++;
//
// Then RLE the AC components (which will record the count
// of the resulting number of items)
//
rleAc (halfZigCoef, currAcComp);
} // chan
} // blockx
} // blocky
}
//
// Reorder from zig-zag order to normal ordering
//
void
DwaCompressor::LossyDctEncoderBase::toZigZag (half *dst, half *src)
{
const int remap[] =
{
0,
1, 8,
16, 9, 2,
3, 10, 17, 24,
32, 25, 18, 11, 4,
5, 12, 19, 26, 33, 40,
48, 41, 34, 27, 20, 13, 6,
7, 14, 21, 28, 35, 42, 49, 56,
57, 50, 43, 36, 29, 22, 15,
23, 30, 37, 44, 51, 58,
59, 52, 45, 38, 31,
39, 46, 53, 60,
61, 54, 47,
55, 62,
63
};
for (int i=0; i<64; ++i)
dst[i] = src[remap[i]];
}
//
// Precomputing the bit count runs faster than using
// the builtin instruction, at least in one case..
//
// Precomputing 8-bits is no slower than 16-bits,
// and saves a fair bit of overhead..
//
int
DwaCompressor::LossyDctEncoderBase::countSetBits (unsigned short src)
{
static const unsigned short numBitsSet[256] =
{
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
return numBitsSet[src & 0xff] + numBitsSet[src >> 8];
}
//
// Take a DCT coefficient, as well as an acceptable error. Search
// nearby values within the error tolerance, that have fewer
// bits set.
//
// The list of candidates has been pre-computed and sorted
// in order of increasing numbers of bits set. This way, we
// can stop searching as soon as we find a candidate that
// is within the error tolerance.
//
half
DwaCompressor::LossyDctEncoderBase::quantize (half src, float errorTolerance)
{
half tmp;
float srcFloat = (float)src;
int numSetBits = countSetBits(src.bits());
const unsigned short *closest = closestData + closestDataOffset[src.bits()];
for (int targetNumSetBits = numSetBits - 1;
targetNumSetBits >= 0;
--targetNumSetBits)
{
tmp.setBits (*closest);
if (fabs ((float)tmp - srcFloat) < errorTolerance)
return tmp;
closest++;
}
return src;
}
//
// RLE the zig-zag of the AC components + copy over
// into another tmp buffer
//
// Try to do a simple RLE scheme to reduce run's of 0's. This
// differs from the jpeg EOB case, since EOB just indicates that
// the rest of the block is zero. In our case, we have lots of
// NaN symbols, which shouldn't be allowed to occur in DCT
// coefficents - so we'll use them for encoding runs.
//
// If the high byte is 0xff, then we have a run of 0's, of length
// given by the low byte. For example, 0xff03 would be a run
// of 3 0's, starting at the current location.
//
// block is our block of 64 coefficients
// acPtr a pointer to back the RLE'd values into.
//
// This will advance the counter, _numAcComp.
//
void
DwaCompressor::LossyDctEncoderBase::rleAc
(half *block,
unsigned short *&acPtr)
{
int dctComp = 1;
unsigned short rleSymbol = 0x0;
while (dctComp < 64)
{
int runLen = 1;
//
// If we don't have a 0, output verbatim
//
if (block[dctComp].bits() != rleSymbol)
{
*acPtr++ = block[dctComp].bits();
_numAcComp++;
dctComp += runLen;
continue;
}
//
// We're sitting on a 0, so see how big the run is.
//
while ((dctComp+runLen < 64) &&
(block[dctComp+runLen].bits() == rleSymbol))
{
runLen++;
}
//
// If the run len is too small, just output verbatim
// otherwise output our run token
//
// Originally, we wouldn't have a separate symbol for
// "end of block". But in some experimentation, it looks
// like using 0xff00 for "end of block" can save a bit
// of space.
//
if (runLen == 1)
{
runLen = 1;
*acPtr++ = block[dctComp].bits();
_numAcComp++;
//
// Using 0xff00 for "end of block"
//
}
else if (runLen + dctComp == 64)
{
//
// Signal EOB
//
*acPtr++ = 0xff00;
_numAcComp++;
}
else
{
//
// Signal normal run
//
*acPtr++ = 0xff00 | runLen;
_numAcComp++;
}
//
// Advance by runLen
//
dctComp += runLen;
}
}
// ==============================================================
//
// DwaCompressor
//
// --------------------------------------------------------------
//
// DwaCompressor()
//
DwaCompressor::DwaCompressor
(const Header &hdr,
int maxScanLineSize,
int numScanLines,
AcCompression acCompression)
:
Compressor(hdr),
_acCompression(acCompression),
_maxScanLineSize(maxScanLineSize),
_numScanLines(numScanLines),
_channels(hdr.channels()),
_packedAcBuffer(0),
_packedAcBufferSize(0),
_packedDcBuffer(0),
_packedDcBufferSize(0),
_rleBuffer(0),
_rleBufferSize(0),
_outBuffer(0),
_outBufferSize(0),
_zip(0),
_dwaCompressionLevel(45.0)
{
_min[0] = hdr.dataWindow().min.x;
_min[1] = hdr.dataWindow().min.y;
_max[0] = hdr.dataWindow().max.x;
_max[1] = hdr.dataWindow().max.y;
for (int i=0; i < NUM_COMPRESSOR_SCHEMES; ++i)
{
_planarUncBuffer[i] = 0;
_planarUncBufferSize[i] = 0;
}
//
// Check the header for a quality attribute
//
if (hasDwaCompressionLevel (hdr))
_dwaCompressionLevel = dwaCompressionLevel (hdr);
}
DwaCompressor::~DwaCompressor()
{
delete[] _packedAcBuffer;
delete[] _packedDcBuffer;
delete[] _rleBuffer;
delete[] _outBuffer;
delete _zip;
for (int i=0; i<NUM_COMPRESSOR_SCHEMES; ++i)
delete[] _planarUncBuffer[i];
}
int
DwaCompressor::numScanLines() const
{
return _numScanLines;
}
Imf::Compressor::Format
DwaCompressor::format() const
{
if (GLOBAL_SYSTEM_LITTLE_ENDIAN)
return NATIVE;
else
return XDR;
}
int
DwaCompressor::compress
(const char *inPtr,
int inSize,
int minY,
const char *&outPtr)
{
return compress
(inPtr,
inSize,
Imath::Box2i (Imath::V2i (_min[0], minY),
Imath::V2i (_max[0], minY + numScanLines() - 1)),
outPtr);
}
int
DwaCompressor::compressTile
(const char *inPtr,
int inSize,
Imath::Box2i range,
const char *&outPtr)
{
return compress (inPtr, inSize, range, outPtr);
}
int
DwaCompressor::compress
(const char *inPtr,
int inSize,
Imath::Box2i range,
const char *&outPtr)
{
const char *inDataPtr = inPtr;
char *packedAcEnd = 0;
char *packedDcEnd = 0;
int fileVersion = 2; // Starting with 2, we write the channel
// classification rules into the file
if (fileVersion < 2)
initializeLegacyChannelRules();
else
initializeDefaultChannelRules();
size_t outBufferSize = 0;
initializeBuffers(outBufferSize);
unsigned short channelRuleSize = 0;
std::vector<Classifier> channelRules;
if (fileVersion >= 2)
{
relevantChannelRules(channelRules);
channelRuleSize = Xdr::size<unsigned short>();
for (size_t i = 0; i < channelRules.size(); ++i)
channelRuleSize += channelRules[i].size();
}
//
// Remember to allocate _outBuffer, if we haven't done so already.
//
outBufferSize += channelRuleSize;
if (outBufferSize > _outBufferSize)
{
_outBufferSize = outBufferSize;
if (_outBuffer != 0)
delete[] _outBuffer;
_outBuffer = new char[outBufferSize];
}
char *outDataPtr = &_outBuffer[NUM_SIZES_SINGLE * sizeof(Imf::Int64) +
channelRuleSize];
//
// We might not be dealing with any color data, in which
// case the AC buffer size will be 0, and deferencing
// a vector will not be a good thing to do.
//
if (_packedAcBuffer)
packedAcEnd = _packedAcBuffer;
if (_packedDcBuffer)
packedDcEnd = _packedDcBuffer;
#define OBIDX(x) (Int64 *)&_outBuffer[x * sizeof (Int64)]
Int64 *version = OBIDX (VERSION);
Int64 *unknownUncompressedSize = OBIDX (UNKNOWN_UNCOMPRESSED_SIZE);
Int64 *unknownCompressedSize = OBIDX (UNKNOWN_COMPRESSED_SIZE);
Int64 *acCompressedSize = OBIDX (AC_COMPRESSED_SIZE);
Int64 *dcCompressedSize = OBIDX (DC_COMPRESSED_SIZE);
Int64 *rleCompressedSize = OBIDX (RLE_COMPRESSED_SIZE);
Int64 *rleUncompressedSize = OBIDX (RLE_UNCOMPRESSED_SIZE);
Int64 *rleRawSize = OBIDX (RLE_RAW_SIZE);
Int64 *totalAcUncompressedCount = OBIDX (AC_UNCOMPRESSED_COUNT);
Int64 *totalDcUncompressedCount = OBIDX (DC_UNCOMPRESSED_COUNT);
Int64 *acCompression = OBIDX (AC_COMPRESSION);
int minX = range.min.x;
int maxX = std::min(range.max.x, _max[0]);
int minY = range.min.y;
int maxY = std::min(range.max.y, _max[1]);
//
// Zero all the numbers in the chunk header
//
memset (_outBuffer, 0, NUM_SIZES_SINGLE * sizeof (Int64));
//
// Setup the AC compression strategy and the version in the data block,
// then write the relevant channel classification rules if needed
//
*version = fileVersion;
*acCompression = _acCompression;
setupChannelData (minX, minY, maxX, maxY);
if (fileVersion >= 2)
{
char *writePtr = &_outBuffer[NUM_SIZES_SINGLE * sizeof(Imf::Int64)];
Xdr::write<CharPtrIO> (writePtr, channelRuleSize);
for (size_t i = 0; i < channelRules.size(); ++i)
channelRules[i].write(writePtr);
}
//
// Determine the start of each row in the input buffer
// Channels are interleaved by scanline
//
std::vector<bool> encodedChannels (_channelData.size());
std::vector< std::vector<const char *> > rowPtrs (_channelData.size());
for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
encodedChannels[chan] = false;
inDataPtr = inPtr;
for (int y = minY; y <= maxY; ++y)
{
for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
{
ChannelData *cd = &_channelData[chan];
if (Imath::modp(y, cd->ySampling) != 0)
continue;
rowPtrs[chan].push_back(inDataPtr);
inDataPtr += cd->width * Imf::pixelTypeSize(cd->type);
}
}
inDataPtr = inPtr;
//
// Make a pass over all our CSC sets and try to encode them first
//
for (unsigned int csc = 0; csc < _cscSets.size(); ++csc)
{
LossyDctEncoderCsc encoder
(_dwaCompressionLevel / 100000.f,
rowPtrs[_cscSets[csc].idx[0]],
rowPtrs[_cscSets[csc].idx[1]],
rowPtrs[_cscSets[csc].idx[2]],
packedAcEnd,
packedDcEnd,
dwaCompressorToNonlinear,
_channelData[_cscSets[csc].idx[0]].width,
_channelData[_cscSets[csc].idx[0]].height,
_channelData[_cscSets[csc].idx[0]].type,
_channelData[_cscSets[csc].idx[1]].type,
_channelData[_cscSets[csc].idx[2]].type);
encoder.execute();
*totalAcUncompressedCount += encoder.numAcValuesEncoded();
*totalDcUncompressedCount += encoder.numDcValuesEncoded();
packedAcEnd += encoder.numAcValuesEncoded() * sizeof(unsigned short);
packedDcEnd += encoder.numDcValuesEncoded() * sizeof(unsigned short);
encodedChannels[_cscSets[csc].idx[0]] = true;
encodedChannels[_cscSets[csc].idx[1]] = true;
encodedChannels[_cscSets[csc].idx[2]] = true;
}
for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
{
ChannelData *cd = &_channelData[chan];
if (encodedChannels[chan])
continue;
switch (cd->compression)
{
case LOSSY_DCT:
//
// For LOSSY_DCT, treat this just like the CSC'd case,
// but only operate on one channel
//
{
const unsigned short *nonlinearLut = 0;
if (!cd->pLinear)
nonlinearLut = dwaCompressorToNonlinear;
LossyDctEncoder encoder
(_dwaCompressionLevel / 100000.f,
rowPtrs[chan],
packedAcEnd,
packedDcEnd,
nonlinearLut,
cd->width,
cd->height,
cd->type);
encoder.execute();
*totalAcUncompressedCount += encoder.numAcValuesEncoded();
*totalDcUncompressedCount += encoder.numDcValuesEncoded();
packedAcEnd +=
encoder.numAcValuesEncoded() * sizeof (unsigned short);
packedDcEnd +=
encoder.numDcValuesEncoded() * sizeof (unsigned short);
}
break;
case RLE:
//
// For RLE, bash the bytes up so that the first bytes of each
// pixel are contingous, as are the second bytes, and so on.
//
for (unsigned int y = 0; y < rowPtrs[chan].size(); ++y)
{
const char *row = rowPtrs[chan][y];
for (int x = 0; x < cd->width; ++x)
{
for (int byte = 0;
byte < Imf::pixelTypeSize (cd->type);
++byte)
{
*cd->planarUncRleEnd[byte]++ = *row++;
}
}
*rleRawSize += cd->width * Imf::pixelTypeSize(cd->type);
}
break;
case UNKNOWN:
//
// Otherwise, just copy data over verbatim
//
{
int scanlineSize = cd->width * Imf::pixelTypeSize(cd->type);
for (unsigned int y = 0; y < rowPtrs[chan].size(); ++y)
{
memcpy (cd->planarUncBufferEnd,
rowPtrs[chan][y],
scanlineSize);
cd->planarUncBufferEnd += scanlineSize;
}
*unknownUncompressedSize += cd->planarUncSize;
}
break;
default:
assert (false);
}
encodedChannels[chan] = true;
}
//
// Pack the Unknown data into the output buffer first. Instead of
// just copying it uncompressed, try zlib compression at least.
//
if (*unknownUncompressedSize > 0)
{
uLongf inSize = (uLongf)(*unknownUncompressedSize);
uLongf outSize = compressBound (inSize);
if (Z_OK != ::compress2 ((Bytef *)outDataPtr,
&outSize,
(const Bytef *)_planarUncBuffer[UNKNOWN],
inSize,
9))
{
throw Iex::BaseExc ("Data compression (zlib) failed.");
}
outDataPtr += outSize;
*unknownCompressedSize = outSize;
}
//
// Now, pack all the Lossy DCT coefficients into our output
// buffer, with Huffman encoding.
//
// Also, record the compressed size and the number of
// uncompressed componentns we have.
//
if (*totalAcUncompressedCount > 0)
{
switch (_acCompression)
{
case STATIC_HUFFMAN:
*acCompressedSize = (int)
hufCompress((unsigned short *)_packedAcBuffer,
(int)*totalAcUncompressedCount,
outDataPtr);
break;
case DEFLATE:
{
uLongf destLen = compressBound (
(*totalAcUncompressedCount) * sizeof (unsigned short));
if (Z_OK != ::compress2
((Bytef *)outDataPtr,
&destLen,
(Bytef *)_packedAcBuffer,
(uLong)(*totalAcUncompressedCount
* sizeof (unsigned short)),
9))
{
throw Iex::InputExc ("Data compression (zlib) failed.");
}
*acCompressedSize = destLen;
}
break;
default:
assert (false);
}
outDataPtr += *acCompressedSize;
}
//
// Handle the DC components separately
//
if (*totalDcUncompressedCount > 0)
{
*dcCompressedSize = _zip->compress
(_packedDcBuffer,
(int)(*totalDcUncompressedCount) * sizeof (unsigned short),
outDataPtr);
outDataPtr += *dcCompressedSize;
}
//
// If we have RLE data, first RLE encode it and set the uncompressed
// size. Then, deflate the results and set the compressed size.
//
if (*rleRawSize > 0)
{
*rleUncompressedSize = rleCompress
((int)(*rleRawSize),
_planarUncBuffer[RLE],
(signed char *)_rleBuffer);
uLongf dstLen = compressBound ((uLongf)*rleUncompressedSize);
if (Z_OK != ::compress2
((Bytef *)outDataPtr,
&dstLen,
(Bytef *)_rleBuffer,
(uLong)(*rleUncompressedSize),
9))
{
throw Iex::BaseExc ("Error compressing RLE'd data.");
}
*rleCompressedSize = dstLen;
outDataPtr += *rleCompressedSize;
}
//
// Flip the counters to XDR format
//
for (int i = 0; i < NUM_SIZES_SINGLE; ++i)
{
Int64 src = *(((Int64 *)_outBuffer) + i);
char *dst = (char *)(((Int64 *)_outBuffer) + i);
Xdr::write<CharPtrIO> (dst, src);
}
//
// We're done - compute the number of bytes we packed
//
outPtr = _outBuffer;
return static_cast<int>(outDataPtr - _outBuffer + 1);
}
int
DwaCompressor::uncompress
(const char *inPtr,
int inSize,
int minY,
const char *&outPtr)
{
return uncompress (inPtr,
inSize,
Imath::Box2i (Imath::V2i (_min[0], minY),
Imath::V2i (_max[0], minY + numScanLines() - 1)),
outPtr);
}
int
DwaCompressor::uncompressTile
(const char *inPtr,
int inSize,
Imath::Box2i range,
const char *&outPtr)
{
return uncompress (inPtr, inSize, range, outPtr);
}
int
DwaCompressor::uncompress
(const char *inPtr,
int inSize,
Imath::Box2i range,
const char *&outPtr)
{
int minX = range.min.x;
int maxX = std::min (range.max.x, _max[0]);
int minY = range.min.y;
int maxY = std::min (range.max.y, _max[1]);
int headerSize = NUM_SIZES_SINGLE*sizeof(Int64);
if (inSize < headerSize)
{
throw Iex::InputExc("Error uncompressing DWA data"
"(truncated header).");
}
//
// Flip the counters from XDR to NATIVE
//
for (int i = 0; i < NUM_SIZES_SINGLE; ++i)
{
Int64 *dst = (((Int64 *)inPtr) + i);
const char *src = (char *)(((Int64 *)inPtr) + i);
Xdr::read<CharPtrIO> (src, *dst);
}
//
// Unwind all the counter info
//
const Int64 *inPtr64 = (const Int64*) inPtr;
Int64 version = *(inPtr64 + VERSION);
Int64 unknownUncompressedSize = *(inPtr64 + UNKNOWN_UNCOMPRESSED_SIZE);
Int64 unknownCompressedSize = *(inPtr64 + UNKNOWN_COMPRESSED_SIZE);
Int64 acCompressedSize = *(inPtr64 + AC_COMPRESSED_SIZE);
Int64 dcCompressedSize = *(inPtr64 + DC_COMPRESSED_SIZE);
Int64 rleCompressedSize = *(inPtr64 + RLE_COMPRESSED_SIZE);
Int64 rleUncompressedSize = *(inPtr64 + RLE_UNCOMPRESSED_SIZE);
Int64 rleRawSize = *(inPtr64 + RLE_RAW_SIZE);
Int64 totalAcUncompressedCount = *(inPtr64 + AC_UNCOMPRESSED_COUNT);
Int64 totalDcUncompressedCount = *(inPtr64 + DC_UNCOMPRESSED_COUNT);
Int64 acCompression = *(inPtr64 + AC_COMPRESSION);
Int64 compressedSize = unknownCompressedSize +
acCompressedSize +
dcCompressedSize +
rleCompressedSize;
const char *dataPtr = inPtr + NUM_SIZES_SINGLE * sizeof(Int64);
if (inSize < headerSize + compressedSize)
{
throw Iex::InputExc("Error uncompressing DWA data"
"(truncated file).");
}
if (unknownUncompressedSize < 0 ||
unknownCompressedSize < 0 ||
acCompressedSize < 0 ||
dcCompressedSize < 0 ||
rleCompressedSize < 0 ||
rleUncompressedSize < 0 ||
rleRawSize < 0 ||
totalAcUncompressedCount < 0 ||
totalDcUncompressedCount < 0)
{
throw Iex::InputExc("Error uncompressing DWA data"
" (corrupt header).");
}
if (version < 2)
initializeLegacyChannelRules();
else
{
unsigned short ruleSize = 0;
Xdr::read<CharPtrIO>(dataPtr, ruleSize);
if (ruleSize < 0)
throw Iex::InputExc("Error uncompressing DWA data"
" (corrupt header file).");
headerSize += ruleSize;
if (inSize < headerSize + compressedSize)
throw Iex::InputExc("Error uncompressing DWA data"
" (truncated file).");
_channelRules.clear();
ruleSize -= Xdr::size<unsigned short> ();
while (ruleSize > 0)
{
Classifier rule(dataPtr, ruleSize);
_channelRules.push_back(rule);
ruleSize -= rule.size();
}
}
size_t outBufferSize = 0;
initializeBuffers(outBufferSize);
//
// Allocate _outBuffer, if we haven't done so already
//
if (_maxScanLineSize * numScanLines() > _outBufferSize)
{
_outBufferSize = _maxScanLineSize * numScanLines();
if (_outBuffer != 0)
delete[] _outBuffer;
_outBuffer = new char[_maxScanLineSize * numScanLines()];
}
char *outBufferEnd = _outBuffer;
//
// Find the start of the RLE packed AC components and
// the DC components for each channel. This will be handy
// if you want to decode the channels in parallel later on.
//
char *packedAcBufferEnd = 0;
if (_packedAcBuffer)
packedAcBufferEnd = _packedAcBuffer;
char *packedDcBufferEnd = 0;
if (_packedDcBuffer)
packedDcBufferEnd = _packedDcBuffer;
//
// UNKNOWN data is packed first, followed by the
// Huffman-compressed AC, then the DC values,
// and then the zlib compressed RLE data.
//
const char *compressedUnknownBuf = dataPtr;
const char *compressedAcBuf = compressedUnknownBuf +
static_cast<ptrdiff_t>(unknownCompressedSize);
const char *compressedDcBuf = compressedAcBuf +
static_cast<ptrdiff_t>(acCompressedSize);
const char *compressedRleBuf = compressedDcBuf +
static_cast<ptrdiff_t>(dcCompressedSize);
//
// Sanity check that the version is something we expect. Right now,
// we can decode version 0, 1, and 2. v1 adds 'end of block' symbols
// to the AC RLE. v2 adds channel classification rules at the
// start of the data block.
//
if ((version < 0) || (version > 2))
throw Iex::InputExc ("Invalid version of compressed data block");
setupChannelData(minX, minY, maxX, maxY);
//
// Uncompress the UNKNOWN data into _planarUncBuffer[UNKNOWN]
//
if (unknownCompressedSize > 0)
{
if (unknownUncompressedSize > _planarUncBufferSize[UNKNOWN])
{
throw Iex::InputExc("Error uncompressing DWA data"
"(corrupt header).");
}
uLongf outSize = (uLongf)unknownUncompressedSize;
if (Z_OK != ::uncompress
((Bytef *)_planarUncBuffer[UNKNOWN],
&outSize,
(Bytef *)compressedUnknownBuf,
(uLong)unknownCompressedSize))
{
throw Iex::BaseExc("Error uncompressing UNKNOWN data.");
}
}
//
// Uncompress the AC data into _packedAcBuffer
//
if (acCompressedSize > 0)
{
if (totalAcUncompressedCount*sizeof(unsigned short) > _packedAcBufferSize)
{
throw Iex::InputExc("Error uncompressing DWA data"
"(corrupt header).");
}
//
// Don't trust the user to get it right, look in the file.
//
switch (acCompression)
{
case STATIC_HUFFMAN:
hufUncompress
(compressedAcBuf,
(int)acCompressedSize,
(unsigned short *)_packedAcBuffer,
(int)totalAcUncompressedCount);
break;
case DEFLATE:
{
uLongf destLen =
(int)(totalAcUncompressedCount) * sizeof (unsigned short);
if (Z_OK != ::uncompress
((Bytef *)_packedAcBuffer,
&destLen,
(Bytef *)compressedAcBuf,
(uLong)acCompressedSize))
{
throw Iex::InputExc ("Data decompression (zlib) failed.");
}
if (totalAcUncompressedCount * sizeof (unsigned short) !=
destLen)
{
throw Iex::InputExc ("AC data corrupt.");
}
}
break;
default:
throw Iex::NoImplExc ("Unknown AC Compression");
break;
}
}
//
// Uncompress the DC data into _packedDcBuffer
//
if (dcCompressedSize > 0)
{
if (totalDcUncompressedCount*sizeof(unsigned short) > _packedDcBufferSize)
{
throw Iex::InputExc("Error uncompressing DWA data"
"(corrupt header).");
}
if (_zip->uncompress
(compressedDcBuf, (int)dcCompressedSize, _packedDcBuffer)
!= (int)totalDcUncompressedCount * sizeof (unsigned short))
{
throw Iex::BaseExc("DC data corrupt.");
}
}
//
// Uncompress the RLE data into _rleBuffer, then unRLE the results
// into _planarUncBuffer[RLE]
//
if (rleRawSize > 0)
{
if (rleUncompressedSize > _rleBufferSize ||
rleRawSize > _planarUncBufferSize[RLE])
{
throw Iex::InputExc("Error uncompressing DWA data"
"(corrupt header).");
}
uLongf dstLen = (uLongf)rleUncompressedSize;
if (Z_OK != ::uncompress
((Bytef *)_rleBuffer,
&dstLen,
(Bytef *)compressedRleBuf,
(uLong)rleCompressedSize))
{
throw Iex::BaseExc("Error uncompressing RLE data.");
}
if (dstLen != rleUncompressedSize)
throw Iex::BaseExc("RLE data corrupted");
if (rleUncompress
((int)rleUncompressedSize,
(int)rleRawSize,
(signed char *)_rleBuffer,
_planarUncBuffer[RLE]) != rleRawSize)
{
throw Iex::BaseExc("RLE data corrupted");
}
}
//
// Determine the start of each row in the output buffer
//
std::vector<bool> decodedChannels (_channelData.size());
std::vector< std::vector<char *> > rowPtrs (_channelData.size());
for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
decodedChannels[chan] = false;
outBufferEnd = _outBuffer;
for (int y = minY; y <= maxY; ++y)
{
for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
{
ChannelData *cd = &_channelData[chan];
if (Imath::modp (y, cd->ySampling) != 0)
continue;
rowPtrs[chan].push_back (outBufferEnd);
outBufferEnd += cd->width * Imf::pixelTypeSize (cd->type);
}
}
//
// Setup to decode each block of 3 channels that need to
// be handled together
//
for (unsigned int csc = 0; csc < _cscSets.size(); ++csc)
{
int rChan = _cscSets[csc].idx[0];
int gChan = _cscSets[csc].idx[1];
int bChan = _cscSets[csc].idx[2];
LossyDctDecoderCsc decoder
(rowPtrs[rChan],
rowPtrs[gChan],
rowPtrs[bChan],
packedAcBufferEnd,
packedDcBufferEnd,
dwaCompressorToLinear,
_channelData[rChan].width,
_channelData[rChan].height,
_channelData[rChan].type,
_channelData[gChan].type,
_channelData[bChan].type);
decoder.execute();
packedAcBufferEnd +=
decoder.numAcValuesEncoded() * sizeof (unsigned short);
packedDcBufferEnd +=
decoder.numDcValuesEncoded() * sizeof (unsigned short);
decodedChannels[rChan] = true;
decodedChannels[gChan] = true;
decodedChannels[bChan] = true;
}
//
// Setup to handle the remaining channels by themselves
//
for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
{
if (decodedChannels[chan])
continue;
ChannelData *cd = &_channelData[chan];
int pixelSize = Imf::pixelTypeSize (cd->type);
switch (cd->compression)
{
case LOSSY_DCT:
//
// Setup a single-channel lossy DCT decoder pointing
// at the output buffer
//
{
const unsigned short *linearLut = 0;
if (!cd->pLinear)
linearLut = dwaCompressorToLinear;
LossyDctDecoder decoder
(rowPtrs[chan],
packedAcBufferEnd,
packedDcBufferEnd,
linearLut,
cd->width,
cd->height,
cd->type);
decoder.execute();
packedAcBufferEnd +=
decoder.numAcValuesEncoded() * sizeof (unsigned short);
packedDcBufferEnd +=
decoder.numDcValuesEncoded() * sizeof (unsigned short);
}
break;
case RLE:
//
// For the RLE case, the data has been un-RLE'd into
// planarUncRleEnd[], but is still split out by bytes.
// We need to rearrange the bytes back into the correct
// order in the output buffer;
//
{
int row = 0;
for (int y = minY; y <= maxY; ++y)
{
if (Imath::modp (y, cd->ySampling) != 0)
continue;
char *dst = rowPtrs[chan][row];
if (pixelSize == 2)
{
interleaveByte2 (dst,
cd->planarUncRleEnd[0],
cd->planarUncRleEnd[1],
cd->width);
cd->planarUncRleEnd[0] += cd->width;
cd->planarUncRleEnd[1] += cd->width;
}
else
{
for (int x = 0; x < cd->width; ++x)
{
for (int byte = 0; byte < pixelSize; ++byte)
{
*dst++ = *cd->planarUncRleEnd[byte]++;
}
}
}
row++;
}
}
break;
case UNKNOWN:
//
// In the UNKNOWN case, data is already in planarUncBufferEnd
// and just needs to copied over to the output buffer
//
{
int row = 0;
int dstScanlineSize = cd->width * Imf::pixelTypeSize (cd->type);
for (int y = minY; y <= maxY; ++y)
{
if (Imath::modp (y, cd->ySampling) != 0)
continue;
memcpy (rowPtrs[chan][row],
cd->planarUncBufferEnd,
dstScanlineSize);
cd->planarUncBufferEnd += dstScanlineSize;
row++;
}
}
break;
default:
throw Iex::NoImplExc ("Unhandled compression scheme case");
break;
}
decodedChannels[chan] = true;
}
//
// Return a ptr to _outBuffer
//
outPtr = _outBuffer;
return (int)(outBufferEnd - _outBuffer);
}
// static
void
DwaCompressor::initializeFuncs()
{
convertFloatToHalf64 = convertFloatToHalf64_scalar;
fromHalfZigZag = fromHalfZigZag_scalar;
CpuId cpuId;
//
// Setup HALF <-> FLOAT conversion implementations
//
if (cpuId.avx && cpuId.f16c)
{
convertFloatToHalf64 = convertFloatToHalf64_f16c;
fromHalfZigZag = fromHalfZigZag_f16c;
}
//
// Setup inverse DCT implementations
//
dctInverse8x8_0 = dctInverse8x8_scalar<0>;
dctInverse8x8_1 = dctInverse8x8_scalar<1>;
dctInverse8x8_2 = dctInverse8x8_scalar<2>;
dctInverse8x8_3 = dctInverse8x8_scalar<3>;
dctInverse8x8_4 = dctInverse8x8_scalar<4>;
dctInverse8x8_5 = dctInverse8x8_scalar<5>;
dctInverse8x8_6 = dctInverse8x8_scalar<6>;
dctInverse8x8_7 = dctInverse8x8_scalar<7>;
if (cpuId.avx)
{
dctInverse8x8_0 = dctInverse8x8_avx<0>;
dctInverse8x8_1 = dctInverse8x8_avx<1>;
dctInverse8x8_2 = dctInverse8x8_avx<2>;
dctInverse8x8_3 = dctInverse8x8_avx<3>;
dctInverse8x8_4 = dctInverse8x8_avx<4>;
dctInverse8x8_5 = dctInverse8x8_avx<5>;
dctInverse8x8_6 = dctInverse8x8_avx<6>;
dctInverse8x8_7 = dctInverse8x8_avx<7>;
}
else if (cpuId.sse2)
{
dctInverse8x8_0 = dctInverse8x8_sse2<0>;
dctInverse8x8_1 = dctInverse8x8_sse2<1>;
dctInverse8x8_2 = dctInverse8x8_sse2<2>;
dctInverse8x8_3 = dctInverse8x8_sse2<3>;
dctInverse8x8_4 = dctInverse8x8_sse2<4>;
dctInverse8x8_5 = dctInverse8x8_sse2<5>;
dctInverse8x8_6 = dctInverse8x8_sse2<6>;
dctInverse8x8_7 = dctInverse8x8_sse2<7>;
}
}
//
// Handle channel classification and buffer allocation once we know
// how to classify channels
//
void
DwaCompressor::initializeBuffers (size_t &outBufferSize)
{
classifyChannels (_channels, _channelData, _cscSets);
//
// _outBuffer needs to be big enough to hold all our
// compressed data - which could vary depending on what sort
// of channels we have.
//
int maxOutBufferSize = 0;
int numLossyDctChans = 0;
int unknownBufferSize = 0;
int rleBufferSize = 0;
int maxLossyDctAcSize = (int)ceil ((float)numScanLines() / 8.0f) *
(int)ceil ((float)(_max[0] - _min[0] + 1) / 8.0f) *
63 * sizeof (unsigned short);
int maxLossyDctDcSize = (int)ceil ((float)numScanLines() / 8.0f) *
(int)ceil ((float)(_max[0] - _min[0] + 1) / 8.0f) *
sizeof (unsigned short);
for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
{
switch (_channelData[chan].compression)
{
case LOSSY_DCT:
//
// This is the size of the number of packed
// components, plus the requirements for
// maximum Huffman encoding size (for STATIC_HUFFMAN)
// or for zlib compression (for DEFLATE)
//
maxOutBufferSize += std::max(
(int)(2 * maxLossyDctAcSize + 65536),
(int)compressBound (maxLossyDctAcSize) );
numLossyDctChans++;
break;
case RLE:
{
//
// RLE, if gone horribly wrong, could double the size
// of the source data.
//
int rleAmount = 2 * numScanLines() * (_max[0] - _min[0] + 1) *
Imf::pixelTypeSize (_channelData[chan].type);
rleBufferSize += rleAmount;
}
break;
case UNKNOWN:
unknownBufferSize += numScanLines() * (_max[0] - _min[0] + 1) *
Imf::pixelTypeSize (_channelData[chan].type);
break;
default:
throw Iex::NoImplExc ("Unhandled compression scheme case");
break;
}
}
//
// Also, since the results of the RLE are packed into
// the output buffer, we need the extra room there. But
// we're going to zlib compress() the data we pack,
// which could take slightly more space
//
maxOutBufferSize += (int)compressBound ((uLongf)rleBufferSize);
//
// And the same goes for the UNKNOWN data
//
maxOutBufferSize += (int)compressBound ((uLongf)unknownBufferSize);
//
// Allocate a zip/deflate compressor big enought to hold the DC data
// and include it's compressed results in the size requirements
// for our output buffer
//
if (_zip == 0)
_zip = new Zip (maxLossyDctDcSize * numLossyDctChans);
else if (_zip->maxRawSize() < maxLossyDctDcSize * numLossyDctChans)
{
delete _zip;
_zip = new Zip (maxLossyDctDcSize * numLossyDctChans);
}
maxOutBufferSize += _zip->maxCompressedSize();
//
// We also need to reserve space at the head of the buffer to
// write out the size of our various packed and compressed data.
//
maxOutBufferSize += NUM_SIZES_SINGLE * sizeof (Int64);
//
// Later, we're going to hijack outBuffer for the result of
// both encoding and decoding. So it needs to be big enough
// to hold either a buffers' worth of uncompressed or
// compressed data
//
// For encoding, we'll need _outBuffer to hold maxOutBufferSize bytes,
// but for decoding, we only need it to be maxScanLineSize*numScanLines.
// Cache the max size for now, and alloc the buffer when we either
// encode or decode.
//
outBufferSize = maxOutBufferSize;
//
// _packedAcBuffer holds the quantized DCT coefficients prior
// to Huffman encoding
//
if (maxLossyDctAcSize * numLossyDctChans > _packedAcBufferSize)
{
_packedAcBufferSize = maxLossyDctAcSize * numLossyDctChans;
if (_packedAcBuffer != 0)
delete[] _packedAcBuffer;
_packedAcBuffer = new char[_packedAcBufferSize];
}
//
// _packedDcBuffer holds one quantized DCT coef per 8x8 block
//
if (maxLossyDctDcSize * numLossyDctChans > _packedDcBufferSize)
{
_packedDcBufferSize = maxLossyDctDcSize * numLossyDctChans;
if (_packedDcBuffer != 0)
delete[] _packedDcBuffer;
_packedDcBuffer = new char[_packedDcBufferSize];
}
if (rleBufferSize > _rleBufferSize)
{
_rleBufferSize = rleBufferSize;
if (_rleBuffer != 0)
delete[] _rleBuffer;
_rleBuffer = new char[rleBufferSize];
}
//
// The planar uncompressed buffer will hold float data for LOSSY_DCT
// compressed values, and whatever the native type is for other
// channels. We're going to use this to hold data in a planar
// format, as opposed to the native interleaved format we take
// into compress() and give back from uncompress().
//
// This also makes it easier to compress the UNKNOWN and RLE data
// all in one swoop (for each compression scheme).
//
int planarUncBufferSize[NUM_COMPRESSOR_SCHEMES];
for (int i=0; i<NUM_COMPRESSOR_SCHEMES; ++i)
planarUncBufferSize[i] = 0;
for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
{
switch (_channelData[chan].compression)
{
case LOSSY_DCT:
break;
case RLE:
planarUncBufferSize[RLE] +=
numScanLines() * (_max[0] - _min[0] + 1) *
Imf::pixelTypeSize (_channelData[chan].type);
break;
case UNKNOWN:
planarUncBufferSize[UNKNOWN] +=
numScanLines() * (_max[0] - _min[0] + 1) *
Imf::pixelTypeSize (_channelData[chan].type);
break;
default:
throw Iex::NoImplExc ("Unhandled compression scheme case");
break;
}
}
//
// UNKNOWN data is going to be zlib compressed, which needs
// a little extra headroom
//
if (planarUncBufferSize[UNKNOWN] > 0)
{
planarUncBufferSize[UNKNOWN] =
compressBound ((uLongf)planarUncBufferSize[UNKNOWN]);
}
for (int i = 0; i < NUM_COMPRESSOR_SCHEMES; ++i)
{
if (planarUncBufferSize[i] > _planarUncBufferSize[i])
{
_planarUncBufferSize[i] = planarUncBufferSize[i];
if (_planarUncBuffer[i] != 0)
delete[] _planarUncBuffer[i];
_planarUncBuffer[i] = new char[planarUncBufferSize[i]];
}
}
}
//
// Setup channel classification rules to use when writing files
//
void
DwaCompressor::initializeDefaultChannelRules ()
{
_channelRules.clear();
_channelRules.push_back (Classifier ("R", LOSSY_DCT, HALF, 0, false));
_channelRules.push_back (Classifier ("R", LOSSY_DCT, FLOAT, 0, false));
_channelRules.push_back (Classifier ("G", LOSSY_DCT, HALF, 1, false));
_channelRules.push_back (Classifier ("G", LOSSY_DCT, FLOAT, 1, false));
_channelRules.push_back (Classifier ("B", LOSSY_DCT, HALF, 2, false));
_channelRules.push_back (Classifier ("B", LOSSY_DCT, FLOAT, 2, false));
_channelRules.push_back (Classifier ("Y", LOSSY_DCT, HALF, -1, false));
_channelRules.push_back (Classifier ("Y", LOSSY_DCT, FLOAT, -1, false));
_channelRules.push_back (Classifier ("BY", LOSSY_DCT, HALF, -1, false));
_channelRules.push_back (Classifier ("BY", LOSSY_DCT, FLOAT, -1, false));
_channelRules.push_back (Classifier ("RY", LOSSY_DCT, HALF, -1, false));
_channelRules.push_back (Classifier ("RY", LOSSY_DCT, FLOAT, -1, false));
_channelRules.push_back (Classifier ("A", RLE, UINT, -1, false));
_channelRules.push_back (Classifier ("A", RLE, HALF, -1, false));
_channelRules.push_back (Classifier ("A", RLE, FLOAT, -1, false));
}
//
// Setup channel classification rules when reading files with VERSION < 2
//
void
DwaCompressor::initializeLegacyChannelRules ()
{
_channelRules.clear();
_channelRules.push_back (Classifier ("r", LOSSY_DCT, HALF, 0, true));
_channelRules.push_back (Classifier ("r", LOSSY_DCT, FLOAT, 0, true));
_channelRules.push_back (Classifier ("red", LOSSY_DCT, HALF, 0, true));
_channelRules.push_back (Classifier ("red", LOSSY_DCT, FLOAT, 0, true));
_channelRules.push_back (Classifier ("g", LOSSY_DCT, HALF, 1, true));
_channelRules.push_back (Classifier ("g", LOSSY_DCT, FLOAT, 1, true));
_channelRules.push_back (Classifier ("grn", LOSSY_DCT, HALF, 1, true));
_channelRules.push_back (Classifier ("grn", LOSSY_DCT, FLOAT, 1, true));
_channelRules.push_back (Classifier ("green", LOSSY_DCT, HALF, 1, true));
_channelRules.push_back (Classifier ("green", LOSSY_DCT, FLOAT, 1, true));
_channelRules.push_back (Classifier ("b", LOSSY_DCT, HALF, 2, true));
_channelRules.push_back (Classifier ("b", LOSSY_DCT, FLOAT, 2, true));
_channelRules.push_back (Classifier ("blu", LOSSY_DCT, HALF, 2, true));
_channelRules.push_back (Classifier ("blu", LOSSY_DCT, FLOAT, 2, true));
_channelRules.push_back (Classifier ("blue", LOSSY_DCT, HALF, 2, true));
_channelRules.push_back (Classifier ("blue", LOSSY_DCT, FLOAT, 2, true));
_channelRules.push_back (Classifier ("y", LOSSY_DCT, HALF, -1, true));
_channelRules.push_back (Classifier ("y", LOSSY_DCT, FLOAT, -1, true));
_channelRules.push_back (Classifier ("by", LOSSY_DCT, HALF, -1, true));
_channelRules.push_back (Classifier ("by", LOSSY_DCT, FLOAT, -1, true));
_channelRules.push_back (Classifier ("ry", LOSSY_DCT, HALF, -1, true));
_channelRules.push_back (Classifier ("ry", LOSSY_DCT, FLOAT, -1, true));
_channelRules.push_back (Classifier ("a", RLE, UINT, -1, true));
_channelRules.push_back (Classifier ("a", RLE, HALF, -1, true));
_channelRules.push_back (Classifier ("a", RLE, FLOAT, -1, true));
}
//
// Given a set of rules and ChannelData, figure out which rules apply
//
void
DwaCompressor::relevantChannelRules (std::vector<Classifier> &rules) const
{
rules.clear();
std::vector<std::string> suffixes;
for (size_t cd = 0; cd < _channelData.size(); ++cd)
{
std::string suffix = _channelData[cd].name;
size_t lastDot = suffix.find_last_of ('.');
if (lastDot != std::string::npos)
suffix = suffix.substr (lastDot+1, std::string::npos);
suffixes.push_back(suffix);
}
for (size_t i = 0; i < _channelRules.size(); ++i)
{
for (size_t cd = 0; cd < _channelData.size(); ++cd)
{
if (_channelRules[i].match (suffixes[cd], _channelData[cd].type ))
{
rules.push_back (_channelRules[i]);
break;
}
}
}
}
//
// Take our initial list of channels, and cache the contents.
//
// Determine approprate compression schemes for each channel,
// and figure out which sets should potentially be CSC'ed
// prior to lossy compression.
//
void
DwaCompressor::classifyChannels
(ChannelList channels,
std::vector<ChannelData> &chanData,
std::vector<CscChannelSet> &cscData)
{
//
// prefixMap used to map channel name prefixes to
// potential CSC-able sets of channels.
//
std::map<std::string, DwaCompressor::CscChannelSet> prefixMap;
std::vector<DwaCompressor::CscChannelSet> tmpCscSet;
unsigned int numChan = 0;
for (ChannelList::Iterator c = channels.begin(); c != channels.end(); ++c)
numChan++;
if (numChan)
chanData.resize (numChan);
//
// Cache the relevant data from the channel structs.
//
unsigned int offset = 0;
for (ChannelList::Iterator c = channels.begin(); c != channels.end(); ++c)
{
chanData[offset].name = std::string (c.name());
chanData[offset].compression = UNKNOWN;
chanData[offset].xSampling = c.channel().xSampling;
chanData[offset].ySampling = c.channel().ySampling;
chanData[offset].type = c.channel().type;
chanData[offset].pLinear = c.channel().pLinear;
offset++;
}
//
// Try and figure out which channels should be
// compressed by which means.
//
for (offset = 0; offset<numChan; ++offset)
{
std::string prefix = "";
std::string suffix = chanData[offset].name;
size_t lastDot = suffix.find_last_of ('.');
if (lastDot != std::string::npos)
{
prefix = suffix.substr (0, lastDot);
suffix = suffix.substr (lastDot+1, std::string::npos);
}
//
// Make sure we have an entry in our CSC set map
//
std::map<std::string, DwaCompressor::CscChannelSet>::iterator
theSet = prefixMap.find (prefix);
if (theSet == prefixMap.end())
{
DwaCompressor::CscChannelSet tmpSet;
tmpSet.idx[0] =
tmpSet.idx[1] =
tmpSet.idx[2] = -1;
prefixMap[prefix] = tmpSet;
}
//
// Check the suffix against the list of classifications
// we defined previously. If the _cscIdx is not negative,
// it indicates that we should be part of a CSC group.
//
for (std::vector<Classifier>::iterator i = _channelRules.begin();
i != _channelRules.end();
++i)
{
if ( i->match(suffix, chanData[offset].type) )
{
chanData[offset].compression = i->_scheme;
if ( i->_cscIdx >= 0)
prefixMap[prefix].idx[i->_cscIdx] = offset;
}
}
}
//
// Finally, try and find RGB sets of channels which
// can be CSC'ed to a Y'CbCr space prior to loss, for
// better compression.
//
// Walk over our set of candidates, and see who has
// all three channels defined (and has common sampling
// patterns, etc).
//
for (std::map<std::string, DwaCompressor::CscChannelSet>::iterator
theItem = prefixMap.begin(); theItem != prefixMap.end();
++theItem)
{
int red = (*theItem).second.idx[0];
int grn = (*theItem).second.idx[1];
int blu = (*theItem).second.idx[2];
if ((red < 0) || (grn < 0) || (blu < 0))
continue;
if ((chanData[red].xSampling != chanData[grn].xSampling) ||
(chanData[red].xSampling != chanData[blu].xSampling) ||
(chanData[grn].xSampling != chanData[blu].xSampling) ||
(chanData[red].ySampling != chanData[grn].ySampling) ||
(chanData[red].ySampling != chanData[blu].ySampling) ||
(chanData[grn].ySampling != chanData[blu].ySampling))
{
continue;
}
tmpCscSet.push_back ((*theItem).second);
}
size_t numCsc = tmpCscSet.size();
if (numCsc)
cscData.resize(numCsc);
for (offset = 0; offset < numCsc; ++offset)
cscData[offset] = tmpCscSet[offset];
}
//
// Setup some buffer pointers, determine channel sizes, things
// like that.
//
void
DwaCompressor::setupChannelData (int minX, int minY, int maxX, int maxY)
{
char *planarUncBuffer[NUM_COMPRESSOR_SCHEMES];
for (int i=0; i<NUM_COMPRESSOR_SCHEMES; ++i)
{
planarUncBuffer[i] = 0;
if (_planarUncBuffer[i])
planarUncBuffer[i] = _planarUncBuffer[i];
}
for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
{
ChannelData *cd = &_channelData[chan];
cd->width = Imf::numSamples (cd->xSampling, minX, maxX);
cd->height = Imf::numSamples (cd->ySampling, minY, maxY);
cd->planarUncSize =
cd->width * cd->height * Imf::pixelTypeSize (cd->type);
cd->planarUncBuffer = planarUncBuffer[cd->compression];
cd->planarUncBufferEnd = cd->planarUncBuffer;
cd->planarUncRle[0] = cd->planarUncBuffer;
cd->planarUncRleEnd[0] = cd->planarUncRle[0];
for (int byte = 1; byte < Imf::pixelTypeSize(cd->type); ++byte)
{
cd->planarUncRle[byte] =
cd->planarUncRle[byte-1] + cd->width * cd->height;
cd->planarUncRleEnd[byte] =
cd->planarUncRle[byte];
}
cd->planarUncType = cd->type;
if (cd->compression == LOSSY_DCT)
{
cd->planarUncType = FLOAT;
}
else
{
planarUncBuffer[cd->compression] +=
cd->width * cd->height * Imf::pixelTypeSize (cd->planarUncType);
}
}
}
OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_EXIT