squish: Update to upstream 1.14

Sources are untouched, tarball from https://sourceforge.net/projects/libsquish (cherry picked from commit 249836e530)
2016-10-13 21:52:16 +02:00 · 2016-10-13 21:52:16 +02:00 · 8263fca121
commit 8263fca121
parent 1022705707
25 changed files with 3912 additions and 3643 deletions
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@ -150,9 +150,8 @@ Files extracted from upstream source:

 ## squish

- Upstream: https://code.google.com/archive/p/libsquish
-  and patches from https://github.com/Cavewhere/squish
- Version: 1.11
+- Upstream: https://sourceforge.net/projects/libsquish
+- Version: 1.14
 - License: MIT

 Files extracted from upstream source:
--- a/thirdparty/squish/alpha.cpp
+++ b/thirdparty/squish/alpha.cpp
@ -24,6 +24,7 @@
   -------------------------------------------------------------------------- */

 #include "alpha.h"
+
 #include <climits>
 #include <algorithm>

--- a/thirdparty/squish/alpha.h
+++ b/thirdparty/squish/alpha.h
@ -26,7 +26,7 @@
 #ifndef SQUISH_ALPHA_H
 #define SQUISH_ALPHA_H

-#include <squish.h>
+#include "squish.h"

 namespace squish {

--- a/thirdparty/squish/clusterfit.cpp
+++ b/thirdparty/squish/clusterfit.cpp
@ -31,22 +31,21 @@

 namespace squish {

-ClusterFit::ClusterFit( ColourSet const* colours, int flags ) 
+ClusterFit::ClusterFit( ColourSet const* colours, int flags, float* metric )
  : ColourFit( colours, flags )
 {
    // set the iteration count
    m_iterationCount = ( m_flags & kColourIterativeClusterFit ) ? kMaxIterations : 1;

-	// initialise the best error
-	m_besterror = VEC4_CONST( FLT_MAX );
-
-	// initialise the metric
-	bool perceptual = ( ( m_flags & kColourMetricPerceptual ) != 0 );
-	if( perceptual )
-		m_metric = Vec4( 0.2126f, 0.7152f, 0.0722f, 0.0f );
+    // initialise the metric (old perceptual = 0.2126f, 0.7152f, 0.0722f)
+    if( metric )
+        m_metric = Vec4( metric[0], metric[1], metric[2], 1.0f );
    else
        m_metric = VEC4_CONST( 1.0f );

+    // initialise the best error
+    m_besterror = VEC4_CONST( FLT_MAX );
+
    // cache some values
    int const count = m_colours->GetCount();
    Vec3 const* values = m_colours->GetPoints();
--- a/thirdparty/squish/clusterfit.h
+++ b/thirdparty/squish/clusterfit.h
@ -27,7 +27,7 @@
 #ifndef SQUISH_CLUSTERFIT_H
 #define SQUISH_CLUSTERFIT_H

-#include <squish.h>
+#include "squish.h"
 #include "maths.h"
 #include "simd.h"
 #include "colourfit.h"
@ -37,7 +37,7 @@ namespace squish {
 class ClusterFit : public ColourFit
 {
 public:
-	ClusterFit( ColourSet const* colours, int flags );
+    ClusterFit( ColourSet const* colours, int flags, float* metric );

 private:
    bool ConstructOrdering( Vec3 const& axis, int iteration );
--- a/thirdparty/squish/colourblock.h
+++ b/thirdparty/squish/colourblock.h
@ -26,7 +26,7 @@
 #ifndef SQUISH_COLOURBLOCK_H
 #define SQUISH_COLOURBLOCK_H

-#include <squish.h>
+#include "squish.h"
 #include "maths.h"

 namespace squish {
--- a/thirdparty/squish/colourfit.cpp
+++ b/thirdparty/squish/colourfit.cpp
@ -34,6 +34,10 @@ ColourFit::ColourFit( ColourSet const* colours, int flags )
 {
 }

+// Out-of-line definition of the virtual destructor declared in colourfit.h.
+ColourFit::~ColourFit()
+{
+}
+
 void ColourFit::Compress( void* block )
 {
    bool isDxt1 = ( ( m_flags & kDxt1 ) != 0 );
--- a/thirdparty/squish/colourfit.h
+++ b/thirdparty/squish/colourfit.h
@ -26,9 +26,11 @@
 #ifndef SQUISH_COLOURFIT_H
 #define SQUISH_COLOURFIT_H

-#include <squish.h>
+#include "squish.h"
 #include "maths.h"

+#include <climits>
+
 namespace squish {

 class ColourSet;
@ -37,6 +39,7 @@ class ColourFit
 {
 public:
    ColourFit( ColourSet const* colours, int flags );
+    virtual ~ColourFit();

    void Compress( void* block );

--- a/thirdparty/squish/colourset.h
+++ b/thirdparty/squish/colourset.h
@ -26,7 +26,7 @@
 #ifndef SQUISH_COLOURSET_H
 #define SQUISH_COLOURSET_H

-#include <squish.h>
+#include "squish.h"
 #include "maths.h"

 namespace squish {
--- a/thirdparty/squish/config.h
+++ b/thirdparty/squish/config.h
@ -36,7 +36,7 @@
 #define SQUISH_USE_SSE 0
 #endif

-// Internally et SQUISH_USE_SIMD when either Altivec or SSE is available.
+// Internally set SQUISH_USE_SIMD when either Altivec or SSE is available.
 #if SQUISH_USE_ALTIVEC && SQUISH_USE_SSE
 #error "Cannot enable both Altivec and SSE!"
 #endif
--- a/thirdparty/squish/maths.cpp
+++ b/thirdparty/squish/maths.cpp
@ -30,6 +30,7 @@
 */

 #include "maths.h"
+#include "simd.h"
 #include <cfloat>

 namespace squish {
@ -44,6 +45,7 @@ Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weight
        total += weights[i];
        centroid += weights[i]*points[i];
    }
+    if( total > FLT_EPSILON )
        centroid /= total;

    // accumulate the covariance matrix
@ -65,6 +67,8 @@ Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weight
    return covariance;
 }

+#if 0
+
 static Vec3 GetMultiplicity1Evector( Sym3x3 const& matrix, float evalue )
 {
    // compute M
@ -224,4 +228,32 @@ Vec3 ComputePrincipleComponent( Sym3x3 const& matrix )
    }
 }

+#else
+
+#define POWER_ITERATION_COUNT    8
+
+// Estimates the dominant eigenvector of a symmetric 3x3 matrix by running
+// POWER_ITERATION_COUNT rounds of power iteration, starting from (1, 1, 1).
+// The result is rescaled each round by the largest of its x/y/z components,
+// so the returned vector is not unit length.
+Vec3 ComputePrincipleComponent( Sym3x3 const& matrix )
+{
+    // expand the six stored coefficients into three full rows (w lane unused)
+    Vec4 const row0( matrix[0], matrix[1], matrix[2], 0.0f );
+    Vec4 const row1( matrix[1], matrix[3], matrix[4], 0.0f );
+    Vec4 const row2( matrix[2], matrix[4], matrix[5], 0.0f );
+    Vec4 v = VEC4_CONST( 1.0f );
+    for( int i = 0; i < POWER_ITERATION_COUNT; ++i )
+    {
+        // matrix multiply
+        Vec4 w = row0*v.SplatX();
+        w = MultiplyAdd(row1, v.SplatY(), w);
+        w = MultiplyAdd(row2, v.SplatZ(), w);
+
+        // get max component from xyz in all channels
+        // (signed maximum, not the maximum magnitude)
+        Vec4 a = Max(w.SplatX(), Max(w.SplatY(), w.SplatZ()));
+
+        // divide through and advance
+        // NOTE(review): if a is zero (e.g. an all-zero matrix) the outcome depends
+        // on how Reciprocal handles 0 in the active simd backend - confirm.
+        v = w*Reciprocal(a);
+    }
+    return v.GetVec3();
+}
+
+#endif
+
 } // namespace squish
--- a/thirdparty/squish/rangefit.cpp
+++ b/thirdparty/squish/rangefit.cpp
@ -30,13 +30,12 @@

 namespace squish {

-RangeFit::RangeFit( ColourSet const* colours, int flags ) 
+RangeFit::RangeFit( ColourSet const* colours, int flags, float* metric )
  : ColourFit( colours, flags )
 {
-	// initialise the metric
-	bool perceptual = ( ( m_flags & kColourMetricPerceptual ) != 0 );
-	if( perceptual )
-		m_metric = Vec3( 0.2126f, 0.7152f, 0.0722f );
+    // initialise the metric (old perceptual = 0.2126f, 0.7152f, 0.0722f)
+    if( metric )
+        m_metric = Vec3( metric[0], metric[1], metric[2] );
    else
        m_metric = Vec3( 1.0f );

--- a/thirdparty/squish/rangefit.h
+++ b/thirdparty/squish/rangefit.h
@ -26,7 +26,7 @@
 #ifndef SQUISH_RANGEFIT_H
 #define SQUISH_RANGEFIT_H

-#include <squish.h>
+#include "squish.h"
 #include "colourfit.h"
 #include "maths.h"

@ -37,7 +37,7 @@ class ColourSet;
 class RangeFit : public ColourFit
 {
 public:
-	RangeFit( ColourSet const* colours, int flags );
+    RangeFit( ColourSet const* colours, int flags, float* metric );

 private:
    virtual void Compress3( void* block );
--- a/thirdparty/squish/simd_ve.h
+++ b/thirdparty/squish/simd_ve.h
@ -31,7 +31,7 @@

 namespace squish {

-#define VEC4_CONST( X ) Vec4( ( vector float )( X ) )
+#define VEC4_CONST( X ) Vec4( ( vector float ){ X } )

 class Vec4
 {
@ -96,7 +96,7 @@ public:

    Vec4& operator*=( Arg v )
    {
-		m_v = vec_madd( m_v, v.m_v, ( vector float )( -0.0f ) );
+        m_v = vec_madd( m_v, v.m_v, ( vector float ){ -0.0f } );
        return *this;
    }

@ -112,7 +112,7 @@ public:

    friend Vec4 operator*( Vec4::Arg left, Vec4::Arg right  )
    {
-		return Vec4( vec_madd( left.m_v, right.m_v, ( vector float )( -0.0f ) ) );
+        return Vec4( vec_madd( left.m_v, right.m_v, ( vector float ){ -0.0f } ) );
    }

    //! Returns a*b + c
@ -133,7 +133,7 @@ public:
        vector float estimate = vec_re( v.m_v );

        // one round of Newton-Rhaphson refinement
-		vector float diff = vec_nmsub( estimate, v.m_v, ( vector float )( 1.0f ) );
+        vector float diff = vec_nmsub( estimate, v.m_v, ( vector float ){ 1.0f } );
        return Vec4( vec_madd( diff, estimate, estimate ) );
    }

--- a/thirdparty/squish/singlecolourfit.cpp
+++ b/thirdparty/squish/singlecolourfit.cpp
@ -26,7 +26,6 @@
 #include "singlecolourfit.h"
 #include "colourset.h"
 #include "colourblock.h"
-#include <climits>

 namespace squish {

--- a/thirdparty/squish/singlecolourfit.h
+++ b/thirdparty/squish/singlecolourfit.h
@ -26,7 +26,7 @@
 #ifndef SQUISH_SINGLECOLOURFIT_H
 #define SQUISH_SINGLECOLOURFIT_H

-#include <squish.h>
+#include "squish.h"
 #include "colourfit.h"

 namespace squish {
--- a/thirdparty/squish/singlecolourlookup.inl
+++ b/thirdparty/squish/singlecolourlookup.inl
@ -1,3 +1,27 @@
+/* -----------------------------------------------------------------------------
+
+    Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+    Permission is hereby granted, free of charge, to any person obtaining
+    a copy of this software and associated documentation files (the
+    "Software"), to deal in the Software without restriction, including
+    without limitation the rights to use, copy, modify, merge, publish,
+    distribute, sublicense, and/or sell copies of the Software, and to
+    permit persons to whom the Software is furnished to do so, subject to
+    the following conditions:
+
+    The above copyright notice and this permission notice shall be included
+    in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+    OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+    IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+    CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+    TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+    SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+   -------------------------------------------------------------------------- */

 static SingleColourLookup const lookup_5_3[] =
 {
--- a/thirdparty/squish/squish.cpp
+++ b/thirdparty/squish/squish.cpp
@ -23,7 +23,8 @@

   -------------------------------------------------------------------------- */

-#include <squish.h>
+#include <string.h>
+#include "squish.h"
 #include "colourset.h"
 #include "maths.h"
 #include "rangefit.h"
@ -37,37 +38,58 @@ namespace squish {
 static int FixFlags( int flags )
 {
    // grab the flag bits
-	int method = flags & ( kDxt1 | kDxt3 | kDxt5 );
+    int method = flags & ( kDxt1 | kDxt3 | kDxt5 | kBc4 | kBc5 );
    int fit = flags & ( kColourIterativeClusterFit | kColourClusterFit | kColourRangeFit );
-	int metric = flags & ( kColourMetricPerceptual | kColourMetricUniform );
    int extra = flags & kWeightColourByAlpha;

    // set defaults
-	if( method != kDxt3 && method != kDxt5 )
+    if ( method != kDxt3
+    &&   method != kDxt5
+    &&   method != kBc4
+    &&   method != kBc5 )
+    {
        method = kDxt1;
-	if( fit != kColourRangeFit )
+    }
+    if( fit != kColourRangeFit && fit != kColourIterativeClusterFit )
        fit = kColourClusterFit;
-	if( metric != kColourMetricUniform )
-		metric = kColourMetricPerceptual;

    // done
-	return method | fit | metric | extra;
+    return method | fit | extra;
 }

-void Compress( u8 const* rgba, void* block, int flags )
-{
-	// compress with full mask
-	CompressMasked( rgba, 0xffff, block, flags );
-}
-
-void CompressMasked( u8 const* rgba, int mask, void* block, int flags )
+void CompressMasked( u8 const* rgba, int mask, void* block, int flags, float* metric )
 {
    // fix any bad flags
    flags = FixFlags( flags );

+    if ( ( flags & ( kBc4 | kBc5 ) ) != 0 )
+    {
+        u8 alpha[16*4];
+        for( int i = 0; i < 16; ++i )
+        {
+            alpha[i*4 + 3] = rgba[i*4 + 0]; // copy R to A
+        }
+
+        u8* rBlock = reinterpret_cast< u8* >( block );
+        CompressAlphaDxt5( alpha, mask, rBlock );
+
+        if ( ( flags & ( kBc5 ) ) != 0 )
+        {
+            for( int i = 0; i < 16; ++i )
+            {
+                alpha[i*4 + 3] = rgba[i*4 + 1]; // copy G to A
+            }
+
+            u8* gBlock = reinterpret_cast< u8* >( block ) + 8;
+            CompressAlphaDxt5( alpha, mask, gBlock );
+        }
+
+        return;
+    }
+
    // get the block locations
    void* colourBlock = block;
-	void* alphaBock = block;
+    void* alphaBlock = block;
    if( ( flags & ( kDxt3 | kDxt5 ) ) != 0 )
        colourBlock = reinterpret_cast< u8* >( block ) + 8;

@ -84,21 +106,21 @@ void CompressMasked( u8 const* rgba, int mask, void* block, int flags )
    else if( ( flags & kColourRangeFit ) != 0 || colours.GetCount() == 0 )
    {
        // do a range fit
-		RangeFit fit( &colours, flags );
+        RangeFit fit( &colours, flags, metric );
        fit.Compress( colourBlock );
    }
    else
    {
        // default to a cluster fit (could be iterative or not)
-		ClusterFit fit( &colours, flags );
+        ClusterFit fit( &colours, flags, metric );
        fit.Compress( colourBlock );
    }

    // compress alpha separately if necessary
    if( ( flags & kDxt3 ) != 0 )
-		CompressAlphaDxt3( rgba, mask, alphaBock );
+        CompressAlphaDxt3( rgba, mask, alphaBlock );
    else if( ( flags & kDxt5 ) != 0 )
-		CompressAlphaDxt5( rgba, mask, alphaBock );
+        CompressAlphaDxt5( rgba, mask, alphaBlock );
 }

 void Decompress( u8* rgba, void const* block, int flags )
@ -108,7 +130,7 @@ void Decompress( u8* rgba, void const* block, int flags )

    // get the block locations
    void const* colourBlock = block;
-	void const* alphaBock = block;
+    void const* alphaBlock = block;
    if( ( flags & ( kDxt3 | kDxt5 ) ) != 0 )
        colourBlock = reinterpret_cast< u8 const* >( block ) + 8;

@ -117,9 +139,9 @@ void Decompress( u8* rgba, void const* block, int flags )

    // decompress alpha separately if necessary
    if( ( flags & kDxt3 ) != 0 )
-		DecompressAlphaDxt3( rgba, alphaBock );
+        DecompressAlphaDxt3( rgba, alphaBlock );
    else if( ( flags & kDxt5 ) != 0 )
-		DecompressAlphaDxt5( rgba, alphaBock );
+        DecompressAlphaDxt5( rgba, alphaBlock );
 }

 int GetStorageRequirements( int width, int height, int flags )
@ -129,18 +151,35 @@ int GetStorageRequirements( int width, int height, int flags )

    // compute the storage requirements
    int blockcount = ( ( width + 3 )/4 ) * ( ( height + 3 )/4 );
-	int blocksize = ( ( flags & kDxt1 ) != 0 ) ? 8 : 16;
+    int blocksize = ( ( flags & ( kDxt1 | kBc4 ) ) != 0 ) ? 8 : 16;
    return blockcount*blocksize;
 }

-void CompressImage( u8 const* rgba, int width, int height, void* blocks, int flags )
+// Copies one 4-byte pixel from source to dest. When kSourceBGRA is set in
+// flags, bytes 0 and 2 are swapped on the way; otherwise the four bytes are
+// copied in order.
+void CopyRGBA( u8 const* source, u8* dest, int flags )
+{
+    if (flags & kSourceBGRA)
+    {
+        // convert from bgra to rgba
+        dest[0] = source[2];
+        dest[1] = source[1];
+        dest[2] = source[0];
+        dest[3] = source[3];
+    }
+    else
+    {
+        // straight byte-for-byte copy
+        for( int i = 0; i < 4; ++i )
+            *dest++ = *source++;
+    }
+}
+
+void CompressImage( u8 const* rgba, int width, int height, int pitch, void* blocks, int flags, float* metric )
 {
    // fix any bad flags
    flags = FixFlags( flags );

    // initialise the block output
    u8* targetBlock = reinterpret_cast< u8* >( blocks );
-	int bytesPerBlock = ( ( flags & kDxt1 ) != 0 ) ? 8 : 16;
+    int bytesPerBlock = ( ( flags & ( kDxt1 | kBc4 ) ) != 0 ) ? 8 : 16;

    // loop over blocks
    for( int y = 0; y < height; y += 4 )
@ -163,23 +202,19 @@ void CompressImage( u8 const* rgba, int width, int height, void* blocks, int fla
                    if( sx < width && sy < height )
                    {
                        // copy the rgba value
-						u8 const* sourcePixel = rgba + 4*( width*sy + sx );
-						for( int i = 0; i < 4; ++i )
-							*targetPixel++ = *sourcePixel++;
-							
+                        u8 const* sourcePixel = rgba + pitch*sy + 4*sx;
+                        CopyRGBA(sourcePixel, targetPixel, flags);
                        // enable this pixel
                        mask |= ( 1 << ( 4*py + px ) );
                    }
-					else
-					{
-						// skip this pixel as its outside the image
+
+                    // advance to the next pixel
                    targetPixel += 4;
                }
            }
-			}

            // compress it into the output
-			CompressMasked( sourceRgba, mask, targetBlock, flags );
+            CompressMasked( sourceRgba, mask, targetBlock, flags, metric );

            // advance
            targetBlock += bytesPerBlock;
@ -187,14 +222,19 @@ void CompressImage( u8 const* rgba, int width, int height, void* blocks, int fla
    }
 }

-void DecompressImage( u8* rgba, int width, int height, void const* blocks, int flags )
+// Overload without an explicit pitch: forwards to the pitch-taking
+// CompressImage using a row stride of width*4 bytes.
+void CompressImage( u8 const* rgba, int width, int height, void* blocks, int flags, float* metric )
+{
+    CompressImage(rgba, width, height, width*4, blocks, flags, metric);
+}
+
+void DecompressImage( u8* rgba, int width, int height, int pitch, void const* blocks, int flags )
 {
    // fix any bad flags
    flags = FixFlags( flags );

    // initialise the block input
    u8 const* sourceBlock = reinterpret_cast< u8 const* >( blocks );
-	int bytesPerBlock = ( ( flags & kDxt1 ) != 0 ) ? 8 : 16;
+    int bytesPerBlock = ( ( flags & ( kDxt1 | kBc4 ) ) != 0 ) ? 8 : 16;

    // loop over blocks
    for( int y = 0; y < height; y += 4 )
@ -214,21 +254,19 @@ void DecompressImage( u8* rgba, int width, int height, void const* blocks, int f
                    // get the target location
                    int sx = x + px;
                    int sy = y + py;
+
+                    // write if we're in the image
                    if( sx < width && sy < height )
                    {
-						u8* targetPixel = rgba + 4*( width*sy + sx );
-						
                        // copy the rgba value
-						for( int i = 0; i < 4; ++i )
-							*targetPixel++ = *sourcePixel++;
+                        u8* targetPixel = rgba + pitch*sy + 4*sx;
+                        CopyRGBA(sourcePixel, targetPixel, flags);
                    }
-					else
-					{
-						// skip this pixel as its outside the image
+
+                    // advance to the next pixel
                    sourcePixel += 4;
                }
            }
-			}

            // advance
            sourceBlock += bytesPerBlock;
@ -236,4 +274,122 @@ void DecompressImage( u8* rgba, int width, int height, void const* blocks, int f
    }
 }

+// Overload without an explicit pitch: forwards to the pitch-taking
+// DecompressImage using a row stride of width*4 bytes.
+void DecompressImage( u8* rgba, int width, int height, void const* blocks, int flags )
+{
+    DecompressImage( rgba, width, height, width*4, blocks, flags );
+}
+
+// Returns the squared difference (x - y)^2.
+static double ErrorSq(double x, double y)
+{
+    return (x - y) * (x - y);
+}
+
+// Accumulates the squared colour and alpha error of one block into cmse/amse.
+//
+// original and compressed are each walked as 16 pixels of 4 bytes, in
+// row-major 4x4 order; only pixels with px < w and py < h contribute.
+// cmse and amse are overwritten (reset to 0 first), not added to.
+static void ComputeBlockWMSE(u8 const *original, u8 const *compressed, unsigned int w, unsigned int h, double &cmse, double &amse)
+{
+    // Computes the MSE for the block and weights it by the variance of the original block.
+    // If the variance of the original block is less than 4 (i.e. a standard deviation of 1 per channel)
+    // then the block is close to being a single colour. Quantisation errors in single colour blocks
+    // are easier to see than similar errors in blocks that contain more colours, particularly when there
+    // are many such blocks in a large area (eg a blue sky background) as they cause banding.  Given that
+    // banding is easier to see than small errors in "complex" blocks, we weight the errors by a factor
+    // of 5. This implies that images with large, single colour areas will have a higher potential WMSE
+    // than images with lots of detail.
+
+    cmse = amse = 0;
+    unsigned int sum_p[4];  // per channel sum of pixels
+    unsigned int sum_p2[4]; // per channel sum of pixels squared
+    memset(sum_p, 0, sizeof(sum_p));
+    memset(sum_p2, 0, sizeof(sum_p2));
+    for( unsigned int py = 0; py < 4; ++py )
+    {
+        for( unsigned int px = 0; px < 4; ++px )
+        {
+            if( px < w && py < h )
+            {
+                // colour error over the first three channels
+                double pixelCMSE = 0;
+                for( int i = 0; i < 3; ++i )
+                {
+                    pixelCMSE += ErrorSq(original[i], compressed[i]);
+                    sum_p[i] += original[i];
+                    sum_p2[i] += (unsigned int)original[i]*original[i];
+                }
+                if( original[3] == 0 && compressed[3] == 0 )
+                    pixelCMSE = 0; // transparent in both, so colour is inconsequential
+                amse += ErrorSq(original[3], compressed[3]);
+                cmse += pixelCMSE;
+                sum_p[3] += original[3];
+                sum_p2[3] += (unsigned int)original[3]*original[3];
+            }
+            // the pointers advance for every cell, including those outside w x h
+            original += 4;
+            compressed += 4;
+        }
+    }
+    // w*h*sum(p^2) - sum(p)^2 equals (w*h)^2 times the channel's variance, so
+    // comparing the four-channel total with 4*(w*h)^2 tests "summed variance < 4"
+    // without any division
+    unsigned int variance = 0;
+    for( int i = 0; i < 4; ++i )
+        variance += w*h*sum_p2[i] - sum_p[i]*sum_p[i];
+    if( variance < 4 * w * w * h * h )
+    {
+        amse *= 5;
+        cmse *= 5;
+    }
+}
+
+// Computes the weighted mean squared error between a source image and its
+// compressed form, separately for colour (averaged over 3 channels) and alpha.
+//
+// rgba      source pixels, 4 bytes per pixel, rows pitch bytes apart
+// dxt       compressed blocks for the same image, in block order
+// flags     compression flags (sanitised with FixFlags)
+// colourMSE receives the colour error; alphaMSE receives the alpha error
+//
+// Both outputs are set to 0 when the image has no pixels.
+void ComputeMSE( u8 const *rgba, int width, int height, int pitch, u8 const *dxt, int flags, double &colourMSE, double &alphaMSE )
+{
+    // fix any bad flags
+    flags = FixFlags( flags );
+    colourMSE = alphaMSE = 0;
+
+    // nothing to measure: return before dividing by a zero pixel count
+    if( width <= 0 || height <= 0 )
+        return;
+
+    // initialise the block input; the stride must match the block size used
+    // by GetStorageRequirements and CompressImage (8 bytes for DXT1 and BC4)
+    u8 const* sourceBlock = dxt;
+    int bytesPerBlock = ( ( flags & ( kDxt1 | kBc4 ) ) != 0 ) ? 8 : 16;
+
+    // loop over blocks
+    for( int y = 0; y < height; y += 4 )
+    {
+        for( int x = 0; x < width; x += 4 )
+        {
+            // decompress the block
+            u8 targetRgba[4*16];
+            Decompress( targetRgba, sourceBlock, flags );
+
+            // gather the matching source pixels into a similar pixel block;
+            // cells outside the image are left unwritten and are never read
+            // because ComputeBlockWMSE is given the clipped block size
+            u8 originalRgba[4*16];
+            u8* originalPixel = originalRgba;
+
+            for( int py = 0; py < 4; ++py )
+            {
+                for( int px = 0; px < 4; ++px )
+                {
+                    int sx = x + px;
+                    int sy = y + py;
+                    if( sx < width && sy < height )
+                    {
+                        u8 const* targetPixel = rgba + pitch*sy + 4*sx;
+                        CopyRGBA(targetPixel, originalPixel, flags);
+                    }
+                    originalPixel += 4;
+                }
+            }
+
+            // compute the weighted MSE of the block
+            double blockCMSE, blockAMSE;
+            ComputeBlockWMSE(originalRgba, targetRgba, std::min(4, width - x), std::min(4, height - y), blockCMSE, blockAMSE);
+            colourMSE += blockCMSE;
+            alphaMSE += blockAMSE;
+            // advance
+            sourceBlock += bytesPerBlock;
+        }
+    }
+
+    // average over the pixel count, computed in double so that large images
+    // cannot overflow an int product
+    double const pixelCount = static_cast< double >( width ) * height;
+    colourMSE /= pixelCount * 3;
+    alphaMSE /= pixelCount;
+}
+
+// Overload without an explicit pitch: forwards to the pitch-taking ComputeMSE
+// using a row stride of width*4 bytes.
+void ComputeMSE( u8 const *rgba, int width, int height, u8 const *dxt, int flags, double &colourMSE, double &alphaMSE )
+{
+    ComputeMSE(rgba, width, height, width*4, dxt, flags, colourMSE, alphaMSE);
+}
+
 } // namespace squish
--- a/thirdparty/squish/squish.h
+++ b/thirdparty/squish/squish.h
@ -47,66 +47,37 @@ enum
    //! Use DXT5 compression.
    kDxt5 = ( 1 << 2 ),

+    //! Use BC4 compression.
+    kBc4 = ( 1 << 3 ),
+
+    //! Use BC5 compression.
+    kBc5 = ( 1 << 4 ),
+
+    //! Use a slow but high quality colour compressor (the default).
+    kColourClusterFit = ( 1 << 5 ),
+
+    //! Use a fast but low quality colour compressor.
+    kColourRangeFit = ( 1 << 6 ),
+
+    //! Weight the colour by alpha during cluster fit (disabled by default).
+    kWeightColourByAlpha = ( 1 << 7 ),
+
    //! Use a very slow but very high quality colour compressor.
    kColourIterativeClusterFit = ( 1 << 8 ),

-	//! Use a slow but high quality colour compressor (the default).
-	kColourClusterFit = ( 1 << 3 ),	
-	
-	//! Use a fast but low quality colour compressor.
-	kColourRangeFit	= ( 1 << 4 ),
-	
-	//! Use a perceptual metric for colour error (the default).
-	kColourMetricPerceptual = ( 1 << 5 ),
-
-	//! Use a uniform metric for colour error.
-	kColourMetricUniform = ( 1 << 6 ),
-	
-	//! Weight the colour by alpha during cluster fit (disabled by default).
-	kWeightColourByAlpha = ( 1 << 7 )
+    //! Source is BGRA rather than RGBA
+    kSourceBGRA = ( 1 << 9 )
 };

 // -----------------------------------------------------------------------------

-/*! @brief Compresses a 4x4 block of pixels.
-
-	@param rgba		The rgba values of the 16 source pixels.
-	@param block	Storage for the compressed DXT block.
-	@param flags	Compression flags.
-	
-	The source pixels should be presented as a contiguous array of 16 rgba
-	values, with each component as 1 byte each. In memory this should be:
-	
-		{ r1, g1, b1, a1, .... , r16, g16, b16, a16 }
-	
-	The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, 
-	however, DXT1 will be used by default if none is specified. When using DXT1 
-	compression, 8 bytes of storage are required for the compressed DXT block. 
-	DXT3 and DXT5 compression require 16 bytes of storage per block.
-	
-	The flags parameter can also specify a preferred colour compressor and 
-	colour error metric to use when fitting the RGB components of the data. 
-	Possible colour compressors are: kColourClusterFit (the default), 
-	kColourRangeFit or kColourIterativeClusterFit. Possible colour error metrics 
-	are: kColourMetricPerceptual (the default) or kColourMetricUniform. If no 
-	flags are specified in any particular category then the default will be 
-	used. Unknown flags are ignored.
-	
-	When using kColourClusterFit, an additional flag can be specified to
-	weight the colour of each pixel by its alpha value. For images that are
-	rendered using alpha blending, this can significantly increase the 
-	perceived quality.
-*/
-void Compress( u8 const* rgba, void* block, int flags );
-
-// -----------------------------------------------------------------------------
-
 /*! @brief Compresses a 4x4 block of pixels.

    @param rgba   The rgba values of the 16 source pixels.
    @param mask   The valid pixel mask.
    @param block  Storage for the compressed DXT block.
    @param flags  Compression flags.
+    @param metric An optional perceptual metric.

    The source pixels should be presented as a contiguous array of 16 rgba
    values, with each component as 1 byte each. In memory this should be:
@ -120,25 +91,73 @@ void Compress( u8 const* rgba, void* block, int flags );
    is in the CompressImage function to disable pixels outside the bounds of
    the image when the width or height is not divisible by 4.

-	The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, 
+    The flags parameter should specify kDxt1, kDxt3, kDxt5, kBc4, or kBc5 compression,
    however, DXT1 will be used by default if none is specified. When using DXT1
    compression, 8 bytes of storage are required for the compressed DXT block.
    DXT3 and DXT5 compression require 16 bytes of storage per block.

-	The flags parameter can also specify a preferred colour compressor and 
-	colour error metric to use when fitting the RGB components of the data. 
-	Possible colour compressors are: kColourClusterFit (the default), 
-	kColourRangeFit or kColourIterativeClusterFit. Possible colour error metrics 
-	are: kColourMetricPerceptual (the default) or kColourMetricUniform. If no 
-	flags are specified in any particular category then the default will be 
-	used. Unknown flags are ignored.
+    The flags parameter can also specify a preferred colour compressor to use
+    when fitting the RGB components of the data. Possible colour compressors
+    are: kColourClusterFit (the default), kColourRangeFit (very fast, low
+    quality) or kColourIterativeClusterFit (slowest, best quality).

-	When using kColourClusterFit, an additional flag can be specified to
-	weight the colour of each pixel by its alpha value. For images that are
-	rendered using alpha blending, this can significantly increase the 
-	perceived quality.
+    When using kColourClusterFit or kColourIterativeClusterFit, an additional
+    flag can be specified to weight the importance of each pixel by its alpha
+    value. For images that are rendered using alpha blending, this can
+    significantly increase the perceived quality.
+
+    The metric parameter can be used to weight the relative importance of each
+    colour channel, or pass NULL to use the default uniform weight of
+    { 1.0f, 1.0f, 1.0f }. This replaces the previous flag-based control that
+    allowed either uniform or "perceptual" weights with the fixed values
+    { 0.2126f, 0.7152f, 0.0722f }. If non-NULL, the metric should point to a
+    contiguous array of 3 floats.
 */
-void CompressMasked( u8 const* rgba, int mask, void* block, int flags );
+void CompressMasked( u8 const* rgba, int mask, void* block, int flags, float* metric = 0 );
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Compresses a 4x4 block of pixels.
+
+    @param rgba   The rgba values of the 16 source pixels.
+    @param block  Storage for the compressed DXT block.
+    @param flags  Compression flags.
+    @param metric An optional perceptual metric.
+
+    The source pixels should be presented as a contiguous array of 16 rgba
+    values, with each component as 1 byte each. In memory this should be:
+
+        { r1, g1, b1, a1, .... , r16, g16, b16, a16 }
+
+    The flags parameter should specify kDxt1, kDxt3, kDxt5, kBc4, or kBc5 compression,
+    however, DXT1 will be used by default if none is specified. When using DXT1
+    or BC4 compression, 8 bytes of storage are required for the compressed block.
+    DXT3, DXT5 and BC5 compression require 16 bytes of storage per block.
+
+    The flags parameter can also specify a preferred colour compressor to use
+    when fitting the RGB components of the data. Possible colour compressors
+    are: kColourClusterFit (the default), kColourRangeFit (very fast, low
+    quality) or kColourIterativeClusterFit (slowest, best quality).
+
+    When using kColourClusterFit or kColourIterativeClusterFit, an additional
+    flag can be specified to weight the importance of each pixel by its alpha
+    value. For images that are rendered using alpha blending, this can
+    significantly increase the perceived quality.
+
+    The metric parameter can be used to weight the relative importance of each
+    colour channel, or pass NULL to use the default uniform weight of
+    { 1.0f, 1.0f, 1.0f }. This replaces the previous flag-based control that
+    allowed either uniform or "perceptual" weights with the fixed values
+    { 0.2126f, 0.7152f, 0.0722f }. If non-NULL, the metric should point to a
+    contiguous array of 3 floats.
+
+    This method is an inline that calls CompressMasked with a mask of 0xffff,
+    provided for compatibility with older versions of squish.
+*/
+inline void Compress( u8 const* rgba, void* block, int flags, float* metric = 0 )
+{
+    // 0xffff sets all 16 mask bits, i.e. every pixel of the block is valid
+    CompressMasked( rgba, 0xffff, block, flags, metric );
+}

 // -----------------------------------------------------------------------------

@ -153,7 +172,7 @@ void CompressMasked( u8 const* rgba, int mask, void* block, int flags );

        { r1, g1, b1, a1, .... , r16, g16, b16, a16 }

-	The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, 
+    The flags parameter should specify kDxt1, kDxt3, kDxt5, kBc4, or kBc5 compression,
    however, DXT1 will be used by default if none is specified. All other flags
    are ignored.
 */
@ -167,7 +186,7 @@ void Decompress( u8* rgba, void const* block, int flags );
    @param height The height of the image.
    @param flags  Compression flags.

-	The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, 
+    The flags parameter should specify kDxt1, kDxt3, kDxt5, kBc4, or kBc5 compression,
    however, DXT1 will be used by default if none is specified. All other flags
    are ignored.

@ -184,37 +203,45 @@ int GetStorageRequirements( int width, int height, int flags );
    @param rgba   The pixels of the source.
    @param width  The width of the source image.
    @param height The height of the source image.
+    @param pitch  The pitch of the source image.
    @param blocks Storage for the compressed output.
    @param flags  Compression flags.
+    @param metric Optional per-channel colour weights (3 floats), or NULL for uniform.

    The source pixels should be presented as a contiguous array of width*height
    rgba values, with each component as 1 byte each. In memory this should be:

        { r1, g1, b1, a1, .... , rn, gn, bn, an } for n = width*height

-	The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, 
+    The flags parameter should specify kDxt1, kDxt3, kDxt5, kBc4, or kBc5 compression,
    however, DXT1 will be used by default if none is specified. When using DXT1
    compression, 8 bytes of storage are required for each compressed DXT block.
    DXT3 and DXT5 compression require 16 bytes of storage per block.

-	The flags parameter can also specify a preferred colour compressor and 
-	colour error metric to use when fitting the RGB components of the data. 
-	Possible colour compressors are: kColourClusterFit (the default), 
-	kColourRangeFit or kColourIterativeClusterFit. Possible colour error metrics 
-	are: kColourMetricPerceptual (the default) or kColourMetricUniform. If no 
-	flags are specified in any particular category then the default will be 
-	used. Unknown flags are ignored.
+    The flags parameter can also specify a preferred colour compressor to use
+    when fitting the RGB components of the data. Possible colour compressors
+    are: kColourClusterFit (the default), kColourRangeFit (very fast, low
+    quality) or kColourIterativeClusterFit (slowest, best quality).

-	When using kColourClusterFit, an additional flag can be specified to
-	weight the colour of each pixel by its alpha value. For images that are
-	rendered using alpha blending, this can significantly increase the 
-	perceived quality.
+    When using kColourClusterFit or kColourIterativeClusterFit, an additional
+    flag can be specified to weight the importance of each pixel by its alpha
+    value. For images that are rendered using alpha blending, this can
+    significantly increase the perceived quality.

-	Internally this function calls squish::Compress for each block. To see how
-	much memory is required in the compressed image, use
-	squish::GetStorageRequirements.
+    The metric parameter can be used to weight the relative importance of each
+    colour channel, or pass NULL to use the default uniform weight of
+    { 1.0f, 1.0f, 1.0f }. This replaces the previous flag-based control that
+    allowed either uniform or "perceptual" weights with the fixed values
+    { 0.2126f, 0.7152f, 0.0722f }. If non-NULL, the metric should point to a
+    contiguous array of 3 floats.
+
+    Internally this function calls squish::CompressMasked for each block, which
+    allows for pixels outside the image to take arbitrary values. The function
+    squish::GetStorageRequirements can be called to compute the amount of memory
+    to allocate for the compressed output.
 */
-void CompressImage( u8 const* rgba, int width, int height, void* blocks, int flags );
+void CompressImage( u8 const* rgba, int width, int height, int pitch, void* blocks, int flags, float* metric = 0 );
+void CompressImage( u8 const* rgba, int width, int height, void* blocks, int flags, float* metric = 0 );

 // -----------------------------------------------------------------------------

@ -223,6 +250,7 @@ void CompressImage( u8 const* rgba, int width, int height, void* blocks, int fla
    @param rgba   Storage for the decompressed pixels.
    @param width  The width of the source image.
    @param height The height of the source image.
+    @param pitch  The pitch of the decompressed pixels.
    @param blocks The compressed DXT blocks.
    @param flags  Compression flags.

@ -231,17 +259,42 @@ void CompressImage( u8 const* rgba, int width, int height, void* blocks, int fla

        { r1, g1, b1, a1, .... , rn, gn, bn, an } for n = width*height

-	The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, 
+    The flags parameter should specify kDxt1, kDxt3, kDxt5, kBc4, or kBc5 compression,
    however, DXT1 will be used by default if none is specified. All other flags
    are ignored.

    Internally this function calls squish::Decompress for each block.
 */
+void DecompressImage( u8* rgba, int width, int height, int pitch, void const* blocks, int flags );
 void DecompressImage( u8* rgba, int width, int height, void const* blocks, int flags );

 // -----------------------------------------------------------------------------

+/*! @brief Computes MSE of a compressed image in memory.
+
+    @param rgba      The original image pixels.
+    @param width     The width of the source image.
+    @param height    The height of the source image.
+    @param pitch     The pitch of the source image.
+    @param dxt       The compressed dxt blocks.
+    @param flags     Compression flags.
+    @param colourMSE The MSE of the colour values.
+    @param alphaMSE  The MSE of the alpha values.
+
+    The colour MSE and alpha MSE are computed across all pixels. The colour MSE is
+    averaged across all rgb values (i.e. colourMSE = sum sum_k ||dxt.k - rgba.k||/3).
+
+    The flags parameter should specify kDxt1, kDxt3, kDxt5, kBc4, or kBc5 compression,
+    however, DXT1 will be used by default if none is specified. All other flags
+    are ignored.
+
+    Internally this function calls squish::Decompress for each block.
+*/
+void ComputeMSE(u8 const *rgba, int width, int height, int pitch, u8 const *dxt, int flags, double &colourMSE, double &alphaMSE);
+void ComputeMSE(u8 const *rgba, int width, int height, u8 const *dxt, int flags, double &colourMSE, double &alphaMSE);
+
+// -----------------------------------------------------------------------------
+
 } // namespace squish

 #endif // ndef SQUISH_H
-