diff --git a/examples/src/Main.cpp b/examples/src/Main.cpp
index a7202c9..103f2c2 100644
--- a/examples/src/Main.cpp
+++ b/examples/src/Main.cpp
@@ -70,11 +70,13 @@ int main(int argc, const char **argv) try
 		//loader.Load(fileData.get(), "test_data/ad_hoc/KittyPurr16_Stereo.flac");
 		//loader.Load(fileData.get(), "test_data/ad_hoc/KittyPurr16_Mono.flac");
 		//loader.Load(fileData.get(), "test_data/ad_hoc/KittyPurr24_Stereo.flac");
+        //auto memory = ReadFile("test_data/ad_hoc/KittyPurr24_Stereo.flac"); // broken
+        //loader.Load(fileData.get(), "flac", memory.buffer); // broken
 
 		// Single-channel opus
 		//loader.Load(fileData.get(), "test_data/ad_hoc/detodos.opus"); // "Firefox: From All, To All"
 
-		// 1 + 2 channel wavepack
+		// 1 + 2 channel wavpack
 		//loader.Load(fileData.get(), "test_data/ad_hoc/TestBeat_Float32.wv");
 		//loader.Load(fileData.get(), "test_data/ad_hoc/TestBeat_Float32_Mono.wv");
 		//loader.Load(fileData.get(), "test_data/ad_hoc/TestBeat_Int16.wv");
@@ -110,7 +112,7 @@ int main(int argc, const char **argv) try
 	// Resample
 	std::vector<float> outputBuffer;
 	outputBuffer.reserve(fileData->samples.size());
-	linear_resample(44100.0 / 48000.0, fileData->samples, outputBuffer, fileData->samples.size());
+	linear_resample(44100.0 / 48000.0, fileData->samples, outputBuffer, (uint32_t) fileData->samples.size());
 
 	std::cout << "Input Samples: " << fileData->samples.size() << std::endl;
 	std::cout << "Output Samples: " << outputBuffer.size() << std::endl;
diff --git a/include/libnyquist/AudioDecoder.h b/include/libnyquist/AudioDecoder.h
index 69336f7..9200ea3 100644
--- a/include/libnyquist/AudioDecoder.h
+++ b/include/libnyquist/AudioDecoder.h
@@ -68,7 +68,9 @@ class NyquistIO
     void AddDecoderToTable(std::shared_ptr<nqr::BaseDecoder> decoder);
     std::map<std::string, std::shared_ptr<BaseDecoder>> decoderTable;
     NO_MOVE(NyquistIO);
+
 public:
+
     NyquistIO();
     ~NyquistIO();
     void Load(AudioData * data, const std::string & path);
diff --git a/include/libnyquist/Common.h b/include/libnyquist/Common.h
index 827c8b5..3641811 100644
--- a/include/libnyquist/Common.h
+++ b/include/libnyquist/Common.h
@@ -201,6 +201,57 @@ inline std::array<uint8_t, 3> Unpack(uint32_t a)
     return output;
 }
 
+//////////////////////////
+// Resampling Utilities //
+//////////////////////////
+
+// This is a naive implementation of a resampling filter where a lerp is used as a bad low-pass.
+// It is very far from the ideal case and should be used with caution (or not at all) on signals that matter.
+// It is included here to upsample 44.1k to 48k for the purposes of microphone input => Opus, where the
+// nominal frequencies of speech are particularly far from Nyquist.
+inline void linear_resample(const double rate, const std::vector<float> & input, std::vector<float> & output, const uint32_t samplesToProcess)
+{
+    // Appends up to (samplesToProcess - 1) interpolated samples to output, advancing the read
+    // position by `rate` input samples per output sample.
+    if (samplesToProcess < 2) return; // also prevents the unsigned wrap of (samplesToProcess - 1)
+    double virtualReadIndex = 0;
+    for (uint32_t n = samplesToProcess - 1; n > 0; --n)
+    {
+        const size_t readIndex = static_cast<size_t>(virtualReadIndex);
+        if (readIndex + 1 >= input.size()) break; // both taps must lie inside the input
+        const double i = virtualReadIndex - static_cast<double>(readIndex);
+        const double sample = (1.0 - i) * input[readIndex] + i * input[readIndex + 1]; // linear interpolate
+        output.push_back(static_cast<float>(sample));
+        virtualReadIndex += rate;
+    }
+}
+
+inline double sample_hermite_4p_3o(double x, double * y)
+{
+    // 4-point, 3rd-order Hermite interpolation: y[0..3] are consecutive samples and x is the
+    // fractional position between y[1] and y[2]. Coefficients are plain locals (not statics) so calls are reentrant.
+    const double c1 = (1.0 / 2.0)*(y[2] - y[0]);
+    const double c2 = (y[0] - (5.0 / 2.0)*y[1]) + (2.0*y[2] - (1.0 / 2.0)*y[3]);
+    const double c3 = (1.0 / 2.0)*(y[3] - y[0]) + (3.0 / 2.0)*(y[1] - y[2]);
+    return ((c3*x + c2)*x + c1)*x + y[1];
+}
+
+inline void hermite_resample(const double rate, const std::vector<float> & input, std::vector<float> & output, const uint32_t samplesToProcess)
+{
+    // Appends up to (samplesToProcess - 1) samples to output. Reading starts at index 1 because
+    // every output sample needs one input sample before the read position and two after it.
+    if (samplesToProcess < 2) return; // also prevents the unsigned wrap of (samplesToProcess - 1)
+    double virtualReadIndex = 1;
+    for (uint32_t n = samplesToProcess - 1; n > 0; --n)
+    {
+        const size_t readIndex = static_cast<size_t>(virtualReadIndex);
+        if (readIndex < 1 || readIndex + 2 >= input.size()) break; // all four taps must lie inside the input
+        double samps[4] = { input[readIndex - 1], input[readIndex], input[readIndex + 1], input[readIndex + 2] };
+        output.push_back(static_cast<float>(sample_hermite_4p_3o(virtualReadIndex - static_cast<double>(readIndex), samps))); // cubic hermite over 4 samples
+        virtualReadIndex += rate;
+    }
+}
+
 //////////////////////////
 // Conversion Utilities //
 //////////////////////////
diff --git a/include/libnyquist/IMA4Util.h b/include/libnyquist/IMA4Util.h
index 8fe684d..1549793 100644
--- a/include/libnyquist/IMA4Util.h
+++ b/include/libnyquist/IMA4Util.h
@@ -100,7 +100,7 @@ namespace nqr
         const uint8_t * data = state.inBuffer;
         
         // Loop over the interleaved channels
-        for (int32_t ch = 0; ch < num_channels; ch++)
+        for (uint32_t ch = 0; ch < num_channels; ch++)
         {
             const int byteOffset = ch * 4;
             
diff --git a/include/libnyquist/ModplugDecoder.h b/include/libnyquist/ModplugDecoder.h
index f0b152f..b8a6f4a 100644
--- a/include/libnyquist/ModplugDecoder.h
+++ b/include/libnyquist/ModplugDecoder.h
@@ -42,4 +42,4 @@ struct ModplugDecoder : public nqr::BaseDecoder
 
 } // end namespace nqr
 
-#endif
\ No newline at end of file
+#endif
diff --git a/include/libnyquist/WavEncoder.h b/include/libnyquist/WavEncoder.h
index f66a03b..dd1da47 100644
--- a/include/libnyquist/WavEncoder.h
+++ b/include/libnyquist/WavEncoder.h
@@ -32,53 +32,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 namespace nqr
 {
-	// This is a naieve implementation of a resampling filter where a lerp is used as a bad low-pass.
-	// It very far from the ideal case and should be used with caution (or not at all) on signals that matter.
-	// It is included here to upsample 44.1k to 48k for the purposes of microphone input => Opus, where the the 
-	// nominal frequencies of speech are particularly far from Nyquist.
-	static inline void linear_resample(const double rate, const std::vector<float> & input, std::vector<float> & output, size_t samplesToProcess)
-	{
-		double virtualReadIndex = 0;
-		double a, b, i, sample;
-		uint32_t n = samplesToProcess - 1;
-		while (n--)
-		{
-			uint32_t readIndex = static_cast<uint32_t>(virtualReadIndex);
-			i = virtualReadIndex - readIndex;
-			a = input[readIndex + 0]; 
-			b = input[readIndex + 1];
-			sample = (1.0 - i) * a + i * b; // linear interpolate
-			output.push_back(sample);
-			virtualReadIndex += rate;
-		}
-	}
-
-	static inline double sample_hermite_4p_3o(double x, double * y)
-	{
-		static double c0, c1, c2, c3;
-		c0 = y[1];
-		c1 = (1.0/2.0)*(y[2]-y[0]);
-		c2 = (y[0] - (5.0/2.0)*y[1]) + (2.0*y[2] - (1.0/2.0)*y[3]);
-		c3 = (1.0/2.0)*(y[3]-y[0]) + (3.0/2.0)*(y[1]-y[2]);
-		return ((c3*x+c2)*x+c1)*x+c0;
-	}
-
-	static inline void hermite_resample(const double rate, const std::vector<float> & input, std::vector<float> & output, size_t samplesToProcess)
-	{
-		double virtualReadIndex = 1;
-		double i, sample;
-		uint32_t n = samplesToProcess - 1;
-		while (n--)
-		{
-			uint32_t readIndex = static_cast<uint32_t>(virtualReadIndex);
-			i = virtualReadIndex - readIndex;
-			double samps[4] = {input[readIndex - 1], input[readIndex], input[readIndex + 1], input[readIndex + 2]};
-			sample = sample_hermite_4p_3o(i, samps); // cubic hermite interpolate over 4 samples
-			output.push_back(sample);
-			virtualReadIndex += rate;
-		}
-	}
-
 	enum EncoderError
 	{
 		NoError,
diff --git a/libnyquist.vcxproj/v140/libnyquist.vcxproj b/libnyquist.vcxproj/v140/libnyquist.vcxproj
index bdf8c43..94e6360 100644
--- a/libnyquist.vcxproj/v140/libnyquist.vcxproj
+++ b/libnyquist.vcxproj/v140/libnyquist.vcxproj
@@ -36,7 +36,6 @@
     <ClCompile Include="..\..\src\ModplugDependencies.cpp" />
     <ClCompile Include="..\..\src\MusepackDecoder.cpp" />
     <ClCompile Include="..\..\src\MusepackDependencies.c" />
-    <ClCompile Include="..\..\src\WavPackDependencies.c" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="$(ProjectDir)..\..\include\libnyquist\AudioDecoder.h" />
@@ -161,4 +160,4 @@
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
   </ImportGroup>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/libnyquist.vcxproj/v140/libnyquist.vcxproj.filters b/libnyquist.vcxproj/v140/libnyquist.vcxproj.filters
index 50dac55..41ca4db 100644
--- a/libnyquist.vcxproj/v140/libnyquist.vcxproj.filters
+++ b/libnyquist.vcxproj/v140/libnyquist.vcxproj.filters
@@ -46,9 +46,6 @@
     <ClCompile Include="..\..\src\MusepackDecoder.cpp">
       <Filter>src</Filter>
     </ClCompile>
-    <ClCompile Include="..\..\src\WavPackDependencies.c">
-      <Filter>src\deps</Filter>
-    </ClCompile>
     <ClCompile Include="..\..\src\ModplugDependencies.cpp">
       <Filter>src\deps</Filter>
     </ClCompile>
@@ -114,4 +111,4 @@
       <UniqueIdentifier>{d839471f-71d1-471e-95e0-c33af9bb64bc}</UniqueIdentifier>
     </Filter>
   </ItemGroup>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/libnyquist.vcxproj/v141/libnyquist.vcxproj b/libnyquist.vcxproj/v141/libnyquist.vcxproj
index 9de6b6a..e090e96 100644
--- a/libnyquist.vcxproj/v141/libnyquist.vcxproj
+++ b/libnyquist.vcxproj/v141/libnyquist.vcxproj
@@ -35,7 +35,32 @@
     <ClCompile Include="..\..\src\ModplugDependencies.cpp" />
     <ClCompile Include="..\..\src\MusepackDecoder.cpp" />
     <ClCompile Include="..\..\src\MusepackDependencies.c" />
-    <ClCompile Include="..\..\src\WavPackDependencies.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\common_utils.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\decorr_utils.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\entropy_utils.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\extra1.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\extra2.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\open_filename.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\open_legacy.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\open_raw.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\open_utils.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\pack.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\pack_dns.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\pack_dsd.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\pack_floats.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\pack_utils.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\read_words.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\tags.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\tag_utils.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\unpack.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\unpack3.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\unpack3_open.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\unpack3_seek.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\unpack_dsd.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\unpack_floats.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\unpack_seek.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\unpack_utils.c" />
+    <ClCompile Include="..\..\third_party\wavpack\src\write_words.c" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="$(ProjectDir)..\..\include\libnyquist\AudioDecoder.h" />
@@ -106,8 +131,9 @@
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
       <SDLCheck>true</SDLCheck>
-      <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;_MBCS;D_SCL_SECURE_NO_WARNINGS;_SCL_SECURE_NO_WARNING;WIN32;_WIN32;USE_ALLOCA;OPUS_BUILD;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>DEBUG;_ITERATOR_DEBUG_LEVEL=0;_CRT_SECURE_NO_WARNINGS;_MBCS;D_SCL_SECURE_NO_WARNINGS;_SCL_SECURE_NO_WARNING;WIN32;_WIN32;USE_ALLOCA;OPUS_BUILD;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <AdditionalIncludeDirectories>$(ProjectDir)..\..\third_party\;$(ProjectDir)..\..\include\libnyquist\;$(ProjectDir)..\..\third_party\libvorbis\include;$(ProjectDir)..\..\third_party\libogg\include;$(ProjectDir)..\..\third_party\wavpack\include;$(ProjectDir)..\..\third_party\flac\src\include;$(ProjectDir)..\..\third_party\opus\celt;$(ProjectDir)..\..\third_party\opus\libopus\include;$(ProjectDir)..\..\third_party\opus\libopus\src;$(ProjectDir)..\..\third_party\opus\opusfile\include;$(ProjectDir)..\..\third_party\opus\opusfile\src;$(ProjectDir)..\..\third_party\opus\opusfile\src\include;$(ProjectDir)..\..\third_party\opus\silk;$(ProjectDir)..\..\third_party\opus\silk\float;$(ProjectDir)..\..\third_party\musepack\include;$(ProjectDir)..\..\third_party\musepack\libmpcenc;$(ProjectDir)..\..\third_party\musepack\libmpcdec;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
     </ClCompile>
     <Link>
       <GenerateDebugInformation>true</GenerateDebugInformation>
@@ -118,8 +144,9 @@
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
       <SDLCheck>true</SDLCheck>
-      <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;_MBCS;D_SCL_SECURE_NO_WARNINGS;_SCL_SECURE_NO_WARNING;WIN32;_WIN32;USE_ALLOCA;OPUS_BUILD;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>DEBUG;_ITERATOR_DEBUG_LEVEL=0;_CRT_SECURE_NO_WARNINGS;_MBCS;D_SCL_SECURE_NO_WARNINGS;_SCL_SECURE_NO_WARNING;WIN32;_WIN32;USE_ALLOCA;OPUS_BUILD;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <AdditionalIncludeDirectories>$(ProjectDir)..\..\third_party\;$(ProjectDir)..\..\include\libnyquist\;$(ProjectDir)..\..\third_party\libvorbis\include;$(ProjectDir)..\..\third_party\libogg\include;$(ProjectDir)..\..\third_party\wavpack\include;$(ProjectDir)..\..\third_party\flac\src\include;$(ProjectDir)..\..\third_party\opus\celt;$(ProjectDir)..\..\third_party\opus\libopus\include;$(ProjectDir)..\..\third_party\opus\libopus\src;$(ProjectDir)..\..\third_party\opus\opusfile\include;$(ProjectDir)..\..\third_party\opus\opusfile\src;$(ProjectDir)..\..\third_party\opus\opusfile\src\include;$(ProjectDir)..\..\third_party\opus\silk;$(ProjectDir)..\..\third_party\opus\silk\float;$(ProjectDir)..\..\third_party\musepack\include;$(ProjectDir)..\..\third_party\musepack\libmpcenc;$(ProjectDir)..\..\third_party\musepack\libmpcdec;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
     </ClCompile>
     <Link>
       <GenerateDebugInformation>true</GenerateDebugInformation>
@@ -134,6 +161,7 @@
       <SDLCheck>true</SDLCheck>
       <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;_MBCS;D_SCL_SECURE_NO_WARNINGS;_SCL_SECURE_NO_WARNING;WIN32;_WIN32;USE_ALLOCA;OPUS_BUILD;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <AdditionalIncludeDirectories>$(ProjectDir)..\..\third_party\;$(ProjectDir)..\..\include\libnyquist\;$(ProjectDir)..\..\third_party\libvorbis\include;$(ProjectDir)..\..\third_party\libogg\include;$(ProjectDir)..\..\third_party\wavpack\include;$(ProjectDir)..\..\third_party\flac\src\include;$(ProjectDir)..\..\third_party\opus\celt;$(ProjectDir)..\..\third_party\opus\libopus\include;$(ProjectDir)..\..\third_party\opus\libopus\src;$(ProjectDir)..\..\third_party\opus\opusfile\include;$(ProjectDir)..\..\third_party\opus\opusfile\src;$(ProjectDir)..\..\third_party\opus\opusfile\src\include;$(ProjectDir)..\..\third_party\opus\silk;$(ProjectDir)..\..\third_party\opus\silk\float;$(ProjectDir)..\..\third_party\musepack\include;$(ProjectDir)..\..\third_party\musepack\libmpcenc;$(ProjectDir)..\..\third_party\musepack\libmpcdec;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
     </ClCompile>
     <Link>
       <GenerateDebugInformation>true</GenerateDebugInformation>
@@ -150,6 +178,7 @@
       <SDLCheck>true</SDLCheck>
       <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;_MBCS;D_SCL_SECURE_NO_WARNINGS;_SCL_SECURE_NO_WARNING;WIN32;_WIN32;USE_ALLOCA;OPUS_BUILD;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <AdditionalIncludeDirectories>$(ProjectDir)..\..\third_party\;$(ProjectDir)..\..\include\libnyquist\;$(ProjectDir)..\..\third_party\libvorbis\include;$(ProjectDir)..\..\third_party\libogg\include;$(ProjectDir)..\..\third_party\wavpack\include;$(ProjectDir)..\..\third_party\flac\src\include;$(ProjectDir)..\..\third_party\opus\celt;$(ProjectDir)..\..\third_party\opus\libopus\include;$(ProjectDir)..\..\third_party\opus\libopus\src;$(ProjectDir)..\..\third_party\opus\opusfile\include;$(ProjectDir)..\..\third_party\opus\opusfile\src;$(ProjectDir)..\..\third_party\opus\opusfile\src\include;$(ProjectDir)..\..\third_party\opus\silk;$(ProjectDir)..\..\third_party\opus\silk\float;$(ProjectDir)..\..\third_party\musepack\include;$(ProjectDir)..\..\third_party\musepack\libmpcenc;$(ProjectDir)..\..\third_party\musepack\libmpcdec;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
     </ClCompile>
     <Link>
       <GenerateDebugInformation>true</GenerateDebugInformation>
diff --git a/libnyquist.vcxproj/v141/libnyquist.vcxproj.filters b/libnyquist.vcxproj/v141/libnyquist.vcxproj.filters
index c58377b..e7425ec 100644
--- a/libnyquist.vcxproj/v141/libnyquist.vcxproj.filters
+++ b/libnyquist.vcxproj/v141/libnyquist.vcxproj.filters
@@ -43,20 +43,92 @@
     <ClCompile Include="..\..\src\MusepackDecoder.cpp">
       <Filter>src</Filter>
     </ClCompile>
-    <ClCompile Include="..\..\src\WavPackDependencies.c">
-      <Filter>src\deps</Filter>
-    </ClCompile>
     <ClCompile Include="..\..\src\ModplugDependencies.cpp">
       <Filter>src\deps</Filter>
     </ClCompile>
     <ClCompile Include="..\..\src\ModplugDecoder.cpp">
       <Filter>src</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\entropy_utils.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\decorr_utils.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\open_legacy.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\open_filename.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\extra2.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\extra1.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\common_utils.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\open_utils.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\open_raw.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\pack.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\pack_dns.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\pack_floats.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\pack_dsd.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\pack_utils.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\read_words.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\tags.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\unpack.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\unpack_dsd.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\unpack_floats.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\tag_utils.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\unpack_seek.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\unpack_utils.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\unpack3.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\unpack3_open.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\unpack3_seek.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\third_party\wavpack\src\write_words.c">
+      <Filter>src\deps\WavpackDependencies</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
-    <ClInclude Include="$(ProjectDir)..\..\include\libnyquist\OpusDecoder.h">
-      <Filter>src</Filter>
-    </ClInclude>
     <ClInclude Include="$(ProjectDir)..\..\include\libnyquist\AudioDecoder.h">
       <Filter>include</Filter>
     </ClInclude>
@@ -93,6 +165,9 @@
     <ClInclude Include="..\..\include\libnyquist\IMA4Util.h">
       <Filter>include\util</Filter>
     </ClInclude>
+    <ClInclude Include="$(ProjectDir)..\..\include\libnyquist\OpusDecoder.h">
+      <Filter>include</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <Filter Include="include">
@@ -107,5 +182,8 @@
     <Filter Include="src\deps">
       <UniqueIdentifier>{d839471f-71d1-471e-95e0-c33af9bb64bc}</UniqueIdentifier>
     </Filter>
+    <Filter Include="src\deps\WavpackDependencies">
+      <UniqueIdentifier>{4ed41d64-0c09-4382-8ee9-70aad6043650}</UniqueIdentifier>
+    </Filter>
   </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/libnyquist.xcodeproj/project.pbxproj b/libnyquist.xcodeproj/project.pbxproj
index 157e941..323926d 100644
--- a/libnyquist.xcodeproj/project.pbxproj
+++ b/libnyquist.xcodeproj/project.pbxproj
@@ -22,7 +22,6 @@
 		08B91DA11AC73B8A00335131 /* OpusDecoder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 08B91D971AC73B8A00335131 /* OpusDecoder.cpp */; };
 		08B91DA21AC73B8A00335131 /* WavDecoder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 08B91D981AC73B8A00335131 /* WavDecoder.cpp */; };
 		08B91DA31AC73B8A00335131 /* WavPackDecoder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 08B91D991AC73B8A00335131 /* WavPackDecoder.cpp */; };
-		08D0EC751C6DA41300FCDA23 /* WavPackDependencies.c in Sources */ = {isa = PBXBuildFile; fileRef = 08D0EC741C6DA41300FCDA23 /* WavPackDependencies.c */; };
 		08FFC72D1CA702EC005812D6 /* ModplugDependencies.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 08FFC72C1CA702EC005812D6 /* ModplugDependencies.cpp */; };
 		08FFC72F1CA7038D005812D6 /* ModplugDecoder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 08FFC72E1CA7038D005812D6 /* ModplugDecoder.cpp */; };
 /* End PBXBuildFile section */
@@ -72,7 +71,6 @@
 		08B91D981AC73B8A00335131 /* WavDecoder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = WavDecoder.cpp; path = src/WavDecoder.cpp; sourceTree = SOURCE_ROOT; };
 		08B91D991AC73B8A00335131 /* WavPackDecoder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = WavPackDecoder.cpp; path = src/WavPackDecoder.cpp; sourceTree = SOURCE_ROOT; };
 		08C83B7C1C25D7780071EED6 /* IMA4Util.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = IMA4Util.h; path = include/libnyquist/IMA4Util.h; sourceTree = SOURCE_ROOT; };
-		08D0EC741C6DA41300FCDA23 /* WavPackDependencies.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = WavPackDependencies.c; path = src/WavPackDependencies.c; sourceTree = SOURCE_ROOT; };
 		08FFC72C1CA702EC005812D6 /* ModplugDependencies.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = ModplugDependencies.cpp; path = src/ModplugDependencies.cpp; sourceTree = SOURCE_ROOT; };
 		08FFC72E1CA7038D005812D6 /* ModplugDecoder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = ModplugDecoder.cpp; path = src/ModplugDecoder.cpp; sourceTree = SOURCE_ROOT; };
 /* End PBXFileReference section */
@@ -92,7 +90,6 @@
 			isa = PBXGroup;
 			children = (
 				08FFC72C1CA702EC005812D6 /* ModplugDependencies.cpp */,
-				08D0EC741C6DA41300FCDA23 /* WavPackDependencies.c */,
 				081FFB181ADF803800673073 /* FlacDependencies.c */,
 				086DADAB1ADF9DF30031F793 /* VorbisDependencies.c */,
 				0804D13E1AE69F0100F4B1FD /* OpusDependencies.c */,
@@ -264,7 +261,6 @@
 				08FFC72D1CA702EC005812D6 /* ModplugDependencies.cpp in Sources */,
 				08B91D9E1AC73B8A00335131 /* FlacDecoder.cpp in Sources */,
 				08FFC72F1CA7038D005812D6 /* ModplugDecoder.cpp in Sources */,
-				08D0EC751C6DA41300FCDA23 /* WavPackDependencies.c in Sources */,
 				086DADAD1AE029860031F793 /* VorbisDependencies.c in Sources */,
 				08B91DA31AC73B8A00335131 /* WavPackDecoder.cpp in Sources */,
 				08B91D9D1AC73B8A00335131 /* Common.cpp in Sources */,
diff --git a/src/AudioDecoder.cpp b/src/AudioDecoder.cpp
index 4082efe..df99b80 100644
--- a/src/AudioDecoder.cpp
+++ b/src/AudioDecoder.cpp
@@ -134,4 +134,4 @@ void NyquistIO::BuildDecoderTable()
     AddDecoderToTable(std::make_shared<OpusDecoder>());
     AddDecoderToTable(std::make_shared<MusepackDecoder>());
     AddDecoderToTable(std::make_shared<ModplugDecoder>());
-}
\ No newline at end of file
+}
diff --git a/src/Common.cpp b/src/Common.cpp
index 1309659..5841c28 100644
--- a/src/Common.cpp
+++ b/src/Common.cpp
@@ -24,6 +24,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
 #include "Common.h"
+#include <cstring>
 
 using namespace nqr;
 
@@ -104,9 +105,10 @@ void nqr::ConvertToFloat32(float * dst, const uint8_t * src, const size_t N, PCM
     
     else if (f == PCM_FLT)
     {
-        const float * dataPtr = reinterpret_cast<const float *>(src);
+        memcpy(dst, src, N * sizeof(float));
+        /* const float * dataPtr = reinterpret_cast<const float *>(src);
         for (size_t i = 0; i < N; ++i)
-            dst[i] = (float) Read32(dataPtr[i]);
+            dst[i] = (float) Read32(dataPtr[i]); */
     }
     else if (f == PCM_DBL)
     {
diff --git a/src/FlacDecoder.cpp b/src/FlacDecoder.cpp
index 78823f9..b89cd5f 100644
--- a/src/FlacDecoder.cpp
+++ b/src/FlacDecoder.cpp
@@ -24,26 +24,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
 #include "FlacDecoder.h"
-#include "flac/all.h"
-#include "flac/stream_decoder.h"
+#include "FLAC/all.h"
+#include "FLAC/stream_decoder.h"
 
 #include "AudioDecoder.h"
+#include <cstring>
 
 using namespace nqr;
 
+// FLAC is a big-endian format. All values are unsigned.
+
 class FlacDecoderInternal
 {
     
 public:
     
-    // FLAC is a big-endian format. All values are unsigned.
-    FlacDecoderInternal(AudioData * d, std::string filepath) : d(d)
+    FlacDecoderInternal(AudioData * d, const std::string & filepath) : d(d)
     {
-        
-        /////////////////////////////
-        // Initialize FLAC library //
-        /////////////////////////////
-        
         decoderInternal = FLAC__stream_decoder_new();
         
         FLAC__stream_decoder_set_metadata_respond(decoderInternal, FLAC__METADATA_TYPE_STREAMINFO);
@@ -57,10 +54,49 @@ public:
                                                           this) == FLAC__STREAM_DECODER_INIT_STATUS_OK;
         
         FLAC__stream_decoder_set_md5_checking(decoderInternal, true);
+
+        if (initialized)
+        {
+            // Find the size and allocate memory
+            FLAC__stream_decoder_process_until_end_of_metadata(decoderInternal);
+            
+            // Read memory out into our temporary internalBuffer
+            FLAC__stream_decoder_process_until_end_of_stream(decoderInternal);
+
+            // Presently unneeded, but useful for reference
+            // FLAC__ChannelAssignment channelAssignment = FLAC__stream_decoder_get_channel_assignment(decoderInternal);
+            
+            // Fill out remaining user data
+            d->lengthSeconds = (float) numSamples / (float) d->sampleRate;
+            
+            auto totalSamples = numSamples * d->channelCount;
+
+            // Next, process internal buffer into the user-visible samples array
+            ConvertToFloat32(d->samples.data(), internalBuffer.data(), totalSamples, d->sourceFormat);
+        }
+        else throw std::runtime_error("Unable to initialize FLAC decoder");
+    }
+
+    FlacDecoderInternal(AudioData * d, const std::vector<uint8_t> & memory) : d(d), data(std::move(memory)), dataPos(0)
+    {
+        decoderInternal = FLAC__stream_decoder_new();
         
-        //////////////////////
-        // Read Stream Data //
-        /////////////////////
+        FLAC__stream_decoder_set_metadata_respond(decoderInternal, FLAC__METADATA_TYPE_STREAMINFO);
+        
+        bool initialized = FLAC__stream_decoder_init_stream(
+          decoderInternal,
+          read_callback,
+          seek_callback,
+          tell_callback,
+          length_callback,
+          eof_callback,
+          s_writeCallback,
+          s_metadataCallback,
+          s_errorCallback,
+          this
+        ) == FLAC__STREAM_DECODER_INIT_STATUS_OK;
+        
+        FLAC__stream_decoder_set_md5_checking(decoderInternal, true);
         
         if (initialized)
         {
@@ -81,12 +117,7 @@ public:
             // Next, process internal buffer into the user-visible samples array
             ConvertToFloat32(d->samples.data(), internalBuffer.data(), totalSamples, d->sourceFormat);
         }
-        
-        else
-        {
-            throw std::runtime_error("Unable to initialize FLAC decoder");
-        }
-        
+        else throw std::runtime_error("Unable to initialize FLAC decoder");
     }
     
     ~FlacDecoderInternal()
@@ -121,16 +152,14 @@ public:
     static FLAC__StreamDecoderWriteStatus s_writeCallback(const FLAC__StreamDecoder *, const FLAC__Frame* frame, const FLAC__int32 * const buffer[], void * userPtr)
     {
         FlacDecoderInternal * decoder = reinterpret_cast<FlacDecoderInternal *>(userPtr);
-        
         const size_t bytesPerSample = GetFormatBitsPerSample(decoder->d->sourceFormat) / 8;
-        
         auto dataPtr = decoder->internalBuffer.data();
         
         for (uint32_t i = 0;  i < frame->header.blocksize; i++)
         {
-            for(int j = 0; j < decoder->d->channelCount; j++)
+            for (int j = 0; j < decoder->d->channelCount; j++)
             {
-                memcpy(dataPtr + decoder->bufferPosition, &buffer[j][i], bytesPerSample);
+                std::memcpy(dataPtr + decoder->bufferPosition, &buffer[j][i], bytesPerSample);
                 decoder->bufferPosition += bytesPerSample;
             }
         }
@@ -145,7 +174,51 @@ public:
     
     static void s_errorCallback (const FLAC__StreamDecoder *, FLAC__StreamDecoderErrorStatus status, void *)
     {
-        std::cerr << "FLAC Decoder Error: " << FLAC__StreamDecoderErrorStatusString[status] << std::endl;
+        throw std::runtime_error("FLAC decode exception " + std::string(FLAC__StreamDecoderErrorStatusString[status])); // NOTE(review): this callback is handed to FLAC__stream_decoder_init_stream, so the exception leaves through libFLAC's frames - confirm that unwinding through them is safe in this build
+    }
+
+    static FLAC__StreamDecoderReadStatus read_callback(const FLAC__StreamDecoder *decoder, FLAC__byte buffer[], size_t *bytes, void *client_data)
+    {
+        // Serves a libFLAC read request from the in-memory `data` buffer, starting at the `dataPos` cursor.
+        FlacDecoderInternal *decoderInternal = (FlacDecoderInternal *)client_data;
+        const size_t remaining = decoderInternal->data.size() - decoderInternal->dataPos;
+        const size_t readLength = std::min<size_t>(*bytes, remaining);
+
+        // Fix: copy from the cursor; the previous code always copied from the start of the buffer.
+        if (readLength > 0) std::memcpy(buffer, decoderInternal->data.data() + decoderInternal->dataPos, readLength);
+        decoderInternal->dataPos += readLength;
+
+        // Fix: always report the delivered byte count, and signal end of stream only on a zero-byte read.
+        *bytes = readLength;
+        return (readLength > 0) ? FLAC__STREAM_DECODER_READ_STATUS_CONTINUE : FLAC__STREAM_DECODER_READ_STATUS_END_OF_STREAM;
+    }
+
+    static FLAC__StreamDecoderSeekStatus seek_callback(const FLAC__StreamDecoder *decoder, FLAC__uint64 absolute_byte_offset, void *client_data)
+    {
+        FlacDecoderInternal *decoderInternal = (FlacDecoderInternal *)client_data; // instance that owns the in-memory stream
+        if (absolute_byte_offset > decoderInternal->data.size()) return FLAC__STREAM_DECODER_SEEK_STATUS_ERROR; // fix: bound by the total size (not the bytes remaining) and report out-of-range seeks
+        decoderInternal->dataPos = static_cast<size_t>(absolute_byte_offset); // absolute reposition of the read cursor
+        return FLAC__STREAM_DECODER_SEEK_STATUS_OK;
+    }
+
+    static FLAC__StreamDecoderTellStatus tell_callback(const FLAC__StreamDecoder *decoder, FLAC__uint64 *absolute_byte_offset, void *client_data)
+    {
+        const auto * self = static_cast<const FlacDecoderInternal *>(client_data); // decoder instance passed through as client data
+        *absolute_byte_offset = static_cast<FLAC__uint64>(self->dataPos); // current read cursor, in bytes
+        return FLAC__STREAM_DECODER_TELL_STATUS_OK;
+    }
+
+    static FLAC__StreamDecoderLengthStatus length_callback(const FLAC__StreamDecoder *decoder, FLAC__uint64 *stream_length, void *client_data)
+    {
+        const auto * self = static_cast<const FlacDecoderInternal *>(client_data); // decoder instance passed through as client data
+        *stream_length = static_cast<FLAC__uint64>(self->data.size()); // total size of the in-memory stream, in bytes
+        return FLAC__STREAM_DECODER_LENGTH_STATUS_OK;
+    }
+
+    static FLAC__bool eof_callback(const FLAC__StreamDecoder *decoder, void *client_data)
+    {
+        const auto * self = static_cast<const FlacDecoderInternal *>(client_data); // decoder instance passed through as client data
+        return self->data.size() == self->dataPos; // true once the cursor sits exactly at the end of the buffer
     }
     
 private:
@@ -153,7 +226,8 @@ private:
     NO_COPY(FlacDecoderInternal);
     
     FLAC__StreamDecoder * decoderInternal;
-    
+    std::vector<uint8_t> data; // encoded stream held in memory and consumed by the stream callbacks
+    size_t dataPos = 0; // read cursor into `data`; fix: zero-initialised like the counters declared next to it
     size_t bufferPosition = 0;
     size_t numSamples = 0;
     
@@ -173,10 +247,10 @@ void FlacDecoder::LoadFromPath(AudioData * data, const std::string & path)
 
 void FlacDecoder::LoadFromBuffer(AudioData * data, const std::vector<uint8_t> & memory)
 {
-    throw LoadBufferNotImplEx();
+    FlacDecoderInternal decoder(data, memory); // the work appears to happen inside the FlacDecoderInternal constructor; the local object is discarded on return
 }
 
 std::vector<std::string> FlacDecoder::GetSupportedFileExtensions()
 {
     return {"flac"};
-}
\ No newline at end of file
+}
diff --git a/src/FlacDependencies.c b/src/FlacDependencies.c
index 772075a..0cb8523 100644
--- a/src/FlacDependencies.c
+++ b/src/FlacDependencies.c
@@ -77,25 +77,25 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     
 #define HAVE_LROUND 1
 
-#include "flac/all.h"
+#include "FLAC/all.h"
 
 #if defined(_MSC_VER)
-#include "flac/src/win_utf8_io.c"
+#include "FLAC/src/win_utf8_io.c"
 #endif
 
-#include "flac/src/bitmath.c"
-#include "flac/src/bitreader.c"
-#include "flac/src/bitwriter.c"
-#include "flac/src/cpu.c"
-#include "flac/src/crc.c"
-#include "flac/src/fixed.c"
-#include "flac/src/float.c"
-#include "flac/src/format.c"
-#include "flac/src/lpc.c"
-#include "flac/src/md5.c"
-#include "flac/src/memory.c"
-#include "flac/src/stream_decoder.c"
-#include "flac/src/window.c"
+#include "FLAC/src/bitmath.c"
+#include "FLAC/src/bitreader.c"
+#include "FLAC/src/bitwriter.c"
+#include "FLAC/src/cpu.c"
+#include "FLAC/src/crc.c"
+#include "FLAC/src/fixed.c"
+#include "FLAC/src/float.c"
+#include "FLAC/src/format.c"
+#include "FLAC/src/lpc.c"
+#include "FLAC/src/md5.c"
+#include "FLAC/src/memory.c"
+#include "FLAC/src/stream_decoder.c"
+#include "FLAC/src/window.c"
 
 #undef VERSION
 
@@ -105,4 +105,4 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if (_MSC_VER)
 #pragma warning (pop)
-#endif
\ No newline at end of file
+#endif
diff --git a/src/ModplugDecoder.cpp b/src/ModplugDecoder.cpp
index badd2c4..7d277cd 100644
--- a/src/ModplugDecoder.cpp
+++ b/src/ModplugDecoder.cpp
@@ -53,10 +53,7 @@ public:
         ModPlug_SetSettings(&mps);
         
         mpf = ModPlug_Load((const void*)fileData.data(), fileData.size());
-        if (!mpf)
-        {
-            throw std::runtime_error("could not load module");
-        }
+        if (!mpf) throw std::runtime_error("could not load module");
         
         d->sampleRate = 44100;
         d->channelCount = 2;
@@ -68,43 +65,46 @@ public:
         auto totalSamples = (44100LL * len_ms) / 1000;
         d->samples.resize(totalSamples * d->channelCount);
 
-        auto readInternal = [&]()
+        auto read_func = [&]() // decodes the module into d->samples; yields true only when the whole buffer was filled
         {
             const float invf = 1 / (float)0x7fffffff;
-        	float *ptr = d->samples.data();
-        	float *end = d->samples.data() + d->samples.size();
+            float *ptr = d->samples.data();
+            float *end = d->samples.data() + d->samples.size();
 
-        	while( ptr < end ) {
-	            int res = ModPlug_Read( mpf, (void*)ptr, (end - ptr) * sizeof(float) );
-	            int samples_read = res / (sizeof(float) * 2);
+            while (ptr < end)
+            {
+                int res = ModPlug_Read(mpf, (void*)ptr, (end - ptr) * sizeof(float)); // raw output lands directly in the destination buffer
+                int samples_read = res / (sizeof(float) * 2); // two 4-byte slots per frame
 
-	            if( totalSamples < samples_read ) {
-	            	samples_read = totalSamples;
-	            }
+                // Fix: a read that yields no frames used to leave ptr untouched and spin forever; stop instead.
+                if (samples_read <= 0) break;
+                // Never consume more frames than the length-derived total accounts for.
+                if (totalSamples < samples_read) samples_read = totalSamples;
 
-	            for( int i = 0; i < samples_read; ++i ) {
-	                *ptr++ = *((int*)ptr) * invf;
-	                *ptr++ = *((int*)ptr) * invf;
-	            }
+                // Fix: convert in place one slot at a time, so ptr is no longer read and incremented in the same expression.
+                for (int i = 0; i < samples_read * 2; ++i, ++ptr)
+                {
+                    *ptr = *((int*)ptr) * invf; // reinterpret the slot as an integer sample and rescale it to float
+                }
 
-	            totalSamples -= samples_read;
-        	}
+                totalSamples -= samples_read;
+            }
 
             return ptr >= end;
         };
 
-        if (!readInternal())
+        if (!read_func())
+        {
             throw std::runtime_error("could not read any data");
+        }
 
         ModPlug_Unload(mpf);
     }
     
 private:
 
-    ModPlugFile* mpf;
-    
+    ModPlugFile * mpf;
     NO_MOVE(ModplugInternal);
-    
     AudioData * d;
 };
 
@@ -128,4 +128,3 @@ std::vector<std::string> ModplugDecoder::GetSupportedFileExtensions()
 {
     return {"pat","mid", "mod","s3m","xm","it","669","amf","ams","dbm","dmf","dsm","far","mdl","med","mtm","okt","ptm","stm","ult","umx","mt2","psm"};
 }
-
diff --git a/src/MusepackDecoder.cpp b/src/MusepackDecoder.cpp
index f95bbea..494e1de 100644
--- a/src/MusepackDecoder.cpp
+++ b/src/MusepackDecoder.cpp
@@ -110,9 +110,7 @@ public:
         reader.tell = tell_mem;
         
         mpcDemux = mpc_demux_init(&reader);
-        
-        if (!mpcDemux)
-            throw std::runtime_error("could not initialize mpc demuxer");
+        if (!mpcDemux) throw std::runtime_error("could not initialize mpc demuxer");
         
         mpc_demux_get_info(mpcDemux, &streamInfo);
         
@@ -190,4 +188,4 @@ void MusepackDecoder::LoadFromBuffer(AudioData * data, const std::vector<uint8_t
 std::vector<std::string> MusepackDecoder::GetSupportedFileExtensions()
 {
     return {"mpc", "mpp"};
-}
\ No newline at end of file
+}
diff --git a/src/OpusDecoder.cpp b/src/OpusDecoder.cpp
index 16d51f8..f720694 100644
--- a/src/OpusDecoder.cpp
+++ b/src/OpusDecoder.cpp
@@ -174,4 +174,4 @@ void nqr::OpusDecoder::LoadFromBuffer(AudioData * data, const std::vector<uint8_
 std::vector<std::string> nqr::OpusDecoder::GetSupportedFileExtensions()
 {
     return {"opus"};
-}
\ No newline at end of file
+}
diff --git a/src/WavDecoder.cpp b/src/WavDecoder.cpp
index 6dace0f..57dab64 100644
--- a/src/WavDecoder.cpp
+++ b/src/WavDecoder.cpp
@@ -26,6 +26,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "WavDecoder.h"
 #include "RiffUtils.h"
 #include "IMA4Util.h"
+#include <cstring>
 
 using namespace nqr;
 
@@ -200,7 +201,7 @@ void WavDecoder::LoadFromBuffer(AudioData * data, const std::vector<uint8_t> & m
         uint32_t frameOffset = 0;
         uint32_t frameCount = DataChunkInfo.size / s.frame_size;
 
-        for (int i = 0; i < frameCount; ++i)
+        for (uint32_t i = 0; i < frameCount; ++i)
         {
             decode_ima_adpcm(s, adpcm_pcm16.data() + frameOffset, wavHeader.channel_count);
             s.inBuffer += s.frame_size;
diff --git a/src/WavEncoder.cpp b/src/WavEncoder.cpp
index a56cbd9..f45b0e4 100644
--- a/src/WavEncoder.cpp
+++ b/src/WavEncoder.cpp
@@ -79,28 +79,25 @@ int WavEncoder::WriteFile(const EncoderParams p, const AudioData * d, const std:
 		sampleDataOptionalMix.resize(sampleDataSize * 2);
 		MonoToStereo(sampleData, sampleDataOptionalMix.data(), sampleDataSize); // Mix
 
-																				// Re-point data
+        // Re-point data
 		sampleData = sampleDataOptionalMix.data();
 		sampleDataSize = sampleDataOptionalMix.size();
 	}
-
 	// Stereo => Mono
 	else if (d->channelCount == 2 && p.channelCount == 1)
 	{
 		sampleDataOptionalMix.resize(sampleDataSize / 2);
 		StereoToMono(sampleData, sampleDataOptionalMix.data(), sampleDataSize); // Mix
 
-																				// Re-point data
+        // Re-point data
 		sampleData = sampleDataOptionalMix.data();
 		sampleDataSize = sampleDataOptionalMix.size();
 
 	}
-
 	else if (d->channelCount == p.channelCount)
 	{
 		// No op
 	}
-
 	else
 	{
 		return EncoderError::UnsupportedChannelMix;
diff --git a/src/WavPackDecoder.cpp b/src/WavPackDecoder.cpp
index 63d3a9f..abdbfc9 100644
--- a/src/WavPackDecoder.cpp
+++ b/src/WavPackDecoder.cpp
@@ -25,6 +25,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "WavPackDecoder.h"
 #include "wavpack.h"
+#include <string.h>
 
 using namespace nqr;
 
@@ -33,11 +34,67 @@ class WavPackInternal
     
 public:
     
-    WavPackInternal(AudioData * d, const std::string path) : d(d)
+    WavPackInternal(AudioData * d, const std::string & path) : d(d) // Opens the WavPack file at `path` and decodes it into d->samples; throws std::runtime_error on failure.
     {
         char errorStr[128];
         context = WavpackOpenFileInput(path.c_str(), errorStr, OPEN_WVC | OPEN_NORMALIZE, 0);
         
+        if (!context) throw std::runtime_error("Not a WavPack file"); // the text written to errorStr is not included in the exception
+        
+        auto bitdepth = WavpackGetBitsPerSample(context);
+        // Copy the stream parameters reported by WavPack into the caller's AudioData.
+        d->sampleRate = WavpackGetSampleRate(context);
+        d->channelCount = WavpackGetNumChannels(context);
+        d->lengthSeconds = double(getLengthInSeconds());
+        d->frameSize = d->channelCount * bitdepth;
+        
+        //@todo support channel masks
+        // WavpackGetChannelMask
+        
+        auto totalSamples = size_t(getTotalSamples());
+        
+        int mode = WavpackGetMode(context);
+        bool isFloatingPoint = (MODE_FLOAT & mode);
+        
+        d->sourceFormat = MakeFormatForBits(bitdepth, isFloatingPoint, false);
+
+        /// From the WavPack docs:
+        /// "... required memory at "buffer" is 4 * samples * num_channels bytes. The
+        /// audio data is returned right-justified in 32-bit longs in the endian
+        /// mode native to the executing processor."
+        d->samples.resize(totalSamples * d->channelCount);
+        // Only non-float streams get the integer staging buffer; it is converted to float further down.
+        if (!isFloatingPoint)
+            internalBuffer.resize(totalSamples * d->channelCount);
+        // readInternal reports how many frames it read; zero is treated as a failed decode.
+        if (!readInternal(totalSamples))
+            throw std::runtime_error("could not read any data");
+        
+        // Next, process internal buffer into the user-visible samples array
+        if (!isFloatingPoint)
+            ConvertToFloat32(d->samples.data(), internalBuffer.data(), totalSamples * d->channelCount, d->sourceFormat);
+        
+    }
+
+    WavPackInternal(AudioData * d, const std::vector<uint8_t> & memory) : d(d), data(std::move(memory)), dataPos(0)
+    {
+        WavpackStreamReader64 reader = 
+        {
+            read_bytes,
+            write_bytes,
+            get_pos,
+            set_pos_abs,
+            set_pos_rel,
+            push_back_byte,
+            get_length,
+            can_seek,
+            truncate_here,
+            close,
+        };
+
+        char errorStr[128];
+        context = WavpackOpenFileInputEx64(&reader, this, nullptr, errorStr, OPEN_WVC | OPEN_NORMALIZE, 0);
+        
         if (!context)
         {
             throw std::runtime_error("Not a WavPack file");
@@ -60,11 +117,6 @@ public:
         
         d->sourceFormat = MakeFormatForBits(bitdepth, isFloatingPoint, false);
 
-        /* From the docs:
-            "... required memory at "buffer" is 4 * samples * num_channels bytes. The
-            audio data is returned right-justified in 32-bit longs in the endian
-            mode native to the executing processor."
-        */
         d->samples.resize(totalSamples * d->channelCount);
         
         if (!isFloatingPoint)
@@ -89,8 +141,6 @@ public:
         size_t framesRemaining = requestedFrameCount;
         size_t totalFramesRead = 0;
         
-        // int frameSize = d->channelCount * WavpackGetBitsPerSample(context);
-        
         // The samples returned are handled differently based on the file's mode
         int mode = WavpackGetMode(context);
         
@@ -111,8 +161,7 @@ public:
             }
             
             // EOF
-            //if (framesRead == 0)
-            //    break;
+            //if (framesRead == 0) break;
             
             totalFramesRead += framesRead;
             framesRemaining -= framesRead;
@@ -120,6 +169,115 @@ public:
         
         return totalFramesRead;
     }
+
+    static int32_t read_bytes(void * id, void * data, int32_t byte_count)
+    {
+        // Stream-reader hook: copies up to byte_count bytes from the in-memory file into `data` and
+        // returns how many bytes were delivered (0 at the end of the data or on unusable arguments).
+        if (id == nullptr || data == nullptr || byte_count <= 0) return 0;
+
+        WavPackInternal *decoder = (WavPackInternal *)id;
+        if (decoder->dataPos >= decoder->data.size()) return 0;
+        const size_t remaining = decoder->data.size() - decoder->dataPos;
+        const int32_t readLength = (int32_t) std::min<size_t>((size_t) byte_count, remaining);
+
+        // Fix: read from the current cursor; the previous code always copied from the start of the buffer.
+        std::memcpy(data, decoder->data.data() + decoder->dataPos, readLength);
+        decoder->dataPos += readLength;
+        return readLength;
+    }
+    static int32_t write_bytes(void * id, void * data, int32_t byte_count)
+    {
+        // Stream-reader hook: overwrites bytes of the decoder's own `data` vector at the cursor and returns
+        // the number of bytes stored; the vector never grows here, so writes are clipped at its end.
+        if (id == nullptr || data == nullptr || byte_count <= 0) return 0;
+
+        WavPackInternal *decoder = (WavPackInternal *)id;
+        if (decoder->dataPos >= decoder->data.size()) return 0;
+        const size_t remaining = decoder->data.size() - decoder->dataPos;
+        const int32_t writeLength = (int32_t) std::min<size_t>((size_t) byte_count, remaining);
+
+        // Fix: store at the current cursor; the previous code always overwrote the start of the buffer.
+        std::memcpy(decoder->data.data() + decoder->dataPos, data, writeLength);
+        decoder->dataPos += writeLength;
+        return writeLength;
+    }
+    static int64_t get_pos(void *id)
+    {
+        // Stream-reader hook: current cursor within the in-memory file (0 when no instance is supplied).
+        if (id == nullptr)
+        {
+            return 0;
+        }
+        return static_cast<int64_t>(static_cast<WavPackInternal *>(id)->dataPos);
+    }
+    static int set_pos_abs(void *id, int64_t pos)
+    {
+        // Stream-reader hook with fseek-style results: 0 on success, non-zero on failure.
+        // Fix: the previous code returned the new position, so any seek past offset 0 read as an error.
+        if (id == nullptr || pos < 0) return -1;
+
+        WavPackInternal *decoder = (WavPackInternal *)id;
+        // Positions beyond the end are clamped to the end of the buffer, as before.
+        decoder->dataPos = std::min<size_t>((size_t) pos, decoder->data.size());
+        return 0;
+    }
+    static int set_pos_rel(void *id, int64_t delta, int mode)
+    {
+        // Stream-reader hook with fseek-style results: 0 on success, non-zero on failure.
+        if (id == nullptr) return -1;
+        WavPackInternal *decoder = (WavPackInternal *)id;
+        // Fix: resolve the target in signed arithmetic so a negative result is detected instead of wrapping.
+        int64_t target = delta;
+        if (mode == SEEK_CUR) target += (int64_t) decoder->dataPos;
+        else if (mode == SEEK_END) target += (int64_t) decoder->data.size();
+        else if (mode != SEEK_SET) return -1; // fix: an unknown origin used to seek silently to offset 0
+        if (target < 0) return -1;
+
+        decoder->dataPos = std::min<size_t>((size_t) target, decoder->data.size()); // clamp to the end, as before
+        return 0; // fix: success is 0; the previous code returned the new position
+    }
+    static int push_back_byte(void *id, int c)
+    {
+        // Stream-reader hook in the style of ungetc: steps the cursor back one byte and stores `c` there.
+        // Returns the pushed byte on success and -1 when there is nothing to step back over.
+        if (id == nullptr) return -1;
+        WavPackInternal *decoder = (WavPackInternal *)id;
+        if (decoder->dataPos == 0) return -1; // fix: decrementing here would wrap and index far out of bounds
+        decoder->dataPos--;
+        decoder->data[decoder->dataPos] = (uint8_t) c;
+        return c;
+    }
+    static int64_t get_length(void *id)
+    {
+        // Stream-reader hook: total size of the in-memory file in bytes (0 when no instance is supplied).
+        if (id == nullptr)
+        {
+            return 0;
+        }
+        return static_cast<int64_t>(static_cast<WavPackInternal *>(id)->data.size());
+    }
+    static int can_seek(void *id)
+    {
+        // Stream-reader hook: reports 1 (seekable) whenever an instance is supplied, otherwise 0.
+        return (id != nullptr) ? 1 : 0;
+    }
+
+    static int truncate_here(void *id)
+    {
+        // Stream-reader hook: discards everything from the cursor onwards in the decoder's `data` vector.
+        // Fix: report success as 0 and failure as non-zero, the stdio-style convention of WavPack's file-backed reader.
+        if (id == nullptr) return -1;
+
+        WavPackInternal *decoder = (WavPackInternal *)id;
+        decoder->data.resize(decoder->dataPos);
+        return 0;
+    }
+    static int close(void *id)
+    {
+        // Stream-reader hook: an in-memory stream has nothing to release. Fix: 0 signals success (fclose convention).
+        return (id != nullptr) ? 0 : -1;
+    }
     
 private:
     
@@ -130,6 +288,8 @@ private:
     WavpackContext * context; //@todo unique_ptr
     
     AudioData * d;
+    std::vector<uint8_t> data;
+    size_t dataPos;
     
     std::vector<int32_t> internalBuffer;
     
@@ -149,7 +309,7 @@ void WavPackDecoder::LoadFromPath(AudioData * data, const std::string & path)
 
 void WavPackDecoder::LoadFromBuffer(AudioData * data, const std::vector<uint8_t> & memory)
 {
-    throw LoadBufferNotImplEx();
+    WavPackInternal decoder(data, memory); // the work appears to happen inside the WavPackInternal constructor; the local object is discarded on return
 }
 
 std::vector<std::string> WavPackDecoder::GetSupportedFileExtensions()
diff --git a/src/WavPackDependencies.c b/src/WavPackDependencies.c
deleted file mode 100644
index 1fec6d0..0000000
--- a/src/WavPackDependencies.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
-Copyright (c) 2015, Dimitri Diakopoulos All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-* Redistributions of source code must retain the above copyright notice, this
-  list of conditions and the following disclaimer.
-
-* Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if (_MSC_VER)
-    #pragma warning (push)
-    #pragma warning (disable: 181 111 4267 4996 4244 4701 4702 4133 4100 4127 4206 4312 4505 4365 4005 4013 4334 4703)
-#endif
-        
-#ifdef __clang__
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wconversion"
-    #pragma clang diagnostic ignored "-Wshadow"
-    #pragma clang diagnostic ignored "-Wdeprecated-register"
-#endif
-
-#ifdef _WIN32
-    #ifndef WIN32
-        #define WIN32
-    #endif
-#endif
-
-#include "wavpack/src/bits.c"
-#include "wavpack/src/extra1.c"
-#define WavpackExtraInfo WavpackExtraInfo_alt
-#define log2overhead log2overhead_alt
-#define xtable xtable_alt
-#include "wavpack/src/extra2.c"
-#include "wavpack/src/float.c"
-#include "wavpack/src/metadata.c"
-#define decorr_stereo_pass decorr_stereo_pass_alt
-#include "wavpack/src/pack.c"
-#include "wavpack/src/tags.c"
-#undef  decorr_stereo_pass
-#define decorr_stereo_pass decorr_stereo_pass_alt_2
-#include "wavpack/src/unpack.c"
-#include "wavpack/src/unpack3.c"
-#include "wavpack/src/words.c"
-#include "wavpack/src/wputils.c"
-
-#ifdef __clang__
-    #pragma clang diagnostic pop
-#endif
-
-#if (_MSC_VER)
-    #pragma warning (pop)
-#endif
diff --git a/third_party/flac/all.h b/third_party/FLAC/all.h
similarity index 100%
rename from third_party/flac/all.h
rename to third_party/FLAC/all.h
diff --git a/third_party/flac/assert.h b/third_party/FLAC/assert.h
similarity index 100%
rename from third_party/flac/assert.h
rename to third_party/FLAC/assert.h
diff --git a/third_party/flac/callback.h b/third_party/FLAC/callback.h
similarity index 100%
rename from third_party/flac/callback.h
rename to third_party/FLAC/callback.h
diff --git a/third_party/flac/export.h b/third_party/FLAC/export.h
similarity index 100%
rename from third_party/flac/export.h
rename to third_party/FLAC/export.h
diff --git a/third_party/flac/format.h b/third_party/FLAC/format.h
similarity index 100%
rename from third_party/flac/format.h
rename to third_party/FLAC/format.h
diff --git a/third_party/flac/metadata.h b/third_party/FLAC/metadata.h
similarity index 100%
rename from third_party/flac/metadata.h
rename to third_party/FLAC/metadata.h
diff --git a/third_party/flac/ordinals.h b/third_party/FLAC/ordinals.h
similarity index 100%
rename from third_party/flac/ordinals.h
rename to third_party/FLAC/ordinals.h
diff --git a/third_party/flac/src/bitmath.c b/third_party/FLAC/src/bitmath.c
similarity index 100%
rename from third_party/flac/src/bitmath.c
rename to third_party/FLAC/src/bitmath.c
diff --git a/third_party/flac/src/bitreader.c b/third_party/FLAC/src/bitreader.c
similarity index 100%
rename from third_party/flac/src/bitreader.c
rename to third_party/FLAC/src/bitreader.c
diff --git a/third_party/flac/src/bitwriter.c b/third_party/FLAC/src/bitwriter.c
similarity index 100%
rename from third_party/flac/src/bitwriter.c
rename to third_party/FLAC/src/bitwriter.c
diff --git a/third_party/flac/src/cpu.c b/third_party/FLAC/src/cpu.c
similarity index 100%
rename from third_party/flac/src/cpu.c
rename to third_party/FLAC/src/cpu.c
diff --git a/third_party/flac/src/crc.c b/third_party/FLAC/src/crc.c
similarity index 100%
rename from third_party/flac/src/crc.c
rename to third_party/FLAC/src/crc.c
diff --git a/third_party/flac/src/fixed.c b/third_party/FLAC/src/fixed.c
similarity index 100%
rename from third_party/flac/src/fixed.c
rename to third_party/FLAC/src/fixed.c
diff --git a/third_party/flac/src/fixed_intrin_sse2.c b/third_party/FLAC/src/fixed_intrin_sse2.c
similarity index 100%
rename from third_party/flac/src/fixed_intrin_sse2.c
rename to third_party/FLAC/src/fixed_intrin_sse2.c
diff --git a/third_party/flac/src/fixed_intrin_ssse3.c b/third_party/FLAC/src/fixed_intrin_ssse3.c
similarity index 100%
rename from third_party/flac/src/fixed_intrin_ssse3.c
rename to third_party/FLAC/src/fixed_intrin_ssse3.c
diff --git a/third_party/flac/src/float.c b/third_party/FLAC/src/float.c
similarity index 100%
rename from third_party/flac/src/float.c
rename to third_party/FLAC/src/float.c
diff --git a/third_party/flac/src/format.c b/third_party/FLAC/src/format.c
similarity index 100%
rename from third_party/flac/src/format.c
rename to third_party/FLAC/src/format.c
diff --git a/third_party/flac/src/ia32/cpu_asm.nasm b/third_party/FLAC/src/ia32/cpu_asm.nasm
similarity index 100%
rename from third_party/flac/src/ia32/cpu_asm.nasm
rename to third_party/FLAC/src/ia32/cpu_asm.nasm
diff --git a/third_party/flac/src/ia32/fixed_asm.nasm b/third_party/FLAC/src/ia32/fixed_asm.nasm
similarity index 100%
rename from third_party/flac/src/ia32/fixed_asm.nasm
rename to third_party/FLAC/src/ia32/fixed_asm.nasm
diff --git a/third_party/flac/src/ia32/lpc_asm.nasm b/third_party/FLAC/src/ia32/lpc_asm.nasm
similarity index 100%
rename from third_party/flac/src/ia32/lpc_asm.nasm
rename to third_party/FLAC/src/ia32/lpc_asm.nasm
diff --git a/third_party/flac/src/ia32/nasm.h b/third_party/FLAC/src/ia32/nasm.h
similarity index 100%
rename from third_party/flac/src/ia32/nasm.h
rename to third_party/FLAC/src/ia32/nasm.h
diff --git a/third_party/flac/src/include/private/all.h b/third_party/FLAC/src/include/private/all.h
similarity index 100%
rename from third_party/flac/src/include/private/all.h
rename to third_party/FLAC/src/include/private/all.h
diff --git a/third_party/flac/src/include/private/bitmath.h b/third_party/FLAC/src/include/private/bitmath.h
similarity index 100%
rename from third_party/flac/src/include/private/bitmath.h
rename to third_party/FLAC/src/include/private/bitmath.h
diff --git a/third_party/flac/src/include/private/bitreader.h b/third_party/FLAC/src/include/private/bitreader.h
similarity index 100%
rename from third_party/flac/src/include/private/bitreader.h
rename to third_party/FLAC/src/include/private/bitreader.h
diff --git a/third_party/flac/src/include/private/bitwriter.h b/third_party/FLAC/src/include/private/bitwriter.h
similarity index 100%
rename from third_party/flac/src/include/private/bitwriter.h
rename to third_party/FLAC/src/include/private/bitwriter.h
diff --git a/third_party/flac/src/include/private/cpu.h b/third_party/FLAC/src/include/private/cpu.h
similarity index 100%
rename from third_party/flac/src/include/private/cpu.h
rename to third_party/FLAC/src/include/private/cpu.h
diff --git a/third_party/flac/src/include/private/crc.h b/third_party/FLAC/src/include/private/crc.h
similarity index 100%
rename from third_party/flac/src/include/private/crc.h
rename to third_party/FLAC/src/include/private/crc.h
diff --git a/third_party/flac/src/include/private/fixed.h b/third_party/FLAC/src/include/private/fixed.h
similarity index 100%
rename from third_party/flac/src/include/private/fixed.h
rename to third_party/FLAC/src/include/private/fixed.h
diff --git a/third_party/flac/src/include/private/float.h b/third_party/FLAC/src/include/private/float.h
similarity index 100%
rename from third_party/flac/src/include/private/float.h
rename to third_party/FLAC/src/include/private/float.h
diff --git a/third_party/flac/src/include/private/format.h b/third_party/FLAC/src/include/private/format.h
similarity index 100%
rename from third_party/flac/src/include/private/format.h
rename to third_party/FLAC/src/include/private/format.h
diff --git a/third_party/flac/src/include/private/lpc.h b/third_party/FLAC/src/include/private/lpc.h
similarity index 100%
rename from third_party/flac/src/include/private/lpc.h
rename to third_party/FLAC/src/include/private/lpc.h
diff --git a/third_party/flac/src/include/private/macros.h b/third_party/FLAC/src/include/private/macros.h
similarity index 100%
rename from third_party/flac/src/include/private/macros.h
rename to third_party/FLAC/src/include/private/macros.h
diff --git a/third_party/flac/src/include/private/md5.h b/third_party/FLAC/src/include/private/md5.h
similarity index 100%
rename from third_party/flac/src/include/private/md5.h
rename to third_party/FLAC/src/include/private/md5.h
diff --git a/third_party/flac/src/include/private/memory.h b/third_party/FLAC/src/include/private/memory.h
similarity index 100%
rename from third_party/flac/src/include/private/memory.h
rename to third_party/FLAC/src/include/private/memory.h
diff --git a/third_party/flac/src/include/private/metadata.h b/third_party/FLAC/src/include/private/metadata.h
similarity index 100%
rename from third_party/flac/src/include/private/metadata.h
rename to third_party/FLAC/src/include/private/metadata.h
diff --git a/third_party/flac/src/include/private/ogg_decoder_aspect.h b/third_party/FLAC/src/include/private/ogg_decoder_aspect.h
similarity index 100%
rename from third_party/flac/src/include/private/ogg_decoder_aspect.h
rename to third_party/FLAC/src/include/private/ogg_decoder_aspect.h
diff --git a/third_party/flac/src/include/private/ogg_encoder_aspect.h b/third_party/FLAC/src/include/private/ogg_encoder_aspect.h
similarity index 100%
rename from third_party/flac/src/include/private/ogg_encoder_aspect.h
rename to third_party/FLAC/src/include/private/ogg_encoder_aspect.h
diff --git a/third_party/flac/src/include/private/ogg_helper.h b/third_party/FLAC/src/include/private/ogg_helper.h
similarity index 100%
rename from third_party/flac/src/include/private/ogg_helper.h
rename to third_party/FLAC/src/include/private/ogg_helper.h
diff --git a/third_party/flac/src/include/private/ogg_mapping.h b/third_party/FLAC/src/include/private/ogg_mapping.h
similarity index 100%
rename from third_party/flac/src/include/private/ogg_mapping.h
rename to third_party/FLAC/src/include/private/ogg_mapping.h
diff --git a/third_party/flac/src/include/private/stream_encoder.h b/third_party/FLAC/src/include/private/stream_encoder.h
similarity index 100%
rename from third_party/flac/src/include/private/stream_encoder.h
rename to third_party/FLAC/src/include/private/stream_encoder.h
diff --git a/third_party/flac/src/include/private/stream_encoder_framing.h b/third_party/FLAC/src/include/private/stream_encoder_framing.h
similarity index 100%
rename from third_party/flac/src/include/private/stream_encoder_framing.h
rename to third_party/FLAC/src/include/private/stream_encoder_framing.h
diff --git a/third_party/flac/src/include/private/window.h b/third_party/FLAC/src/include/private/window.h
similarity index 100%
rename from third_party/flac/src/include/private/window.h
rename to third_party/FLAC/src/include/private/window.h
diff --git a/third_party/flac/src/include/protected/all.h b/third_party/FLAC/src/include/protected/all.h
similarity index 100%
rename from third_party/flac/src/include/protected/all.h
rename to third_party/FLAC/src/include/protected/all.h
diff --git a/third_party/flac/src/include/protected/stream_decoder.h b/third_party/FLAC/src/include/protected/stream_decoder.h
similarity index 100%
rename from third_party/flac/src/include/protected/stream_decoder.h
rename to third_party/FLAC/src/include/protected/stream_decoder.h
diff --git a/third_party/flac/src/include/protected/stream_encoder.h b/third_party/FLAC/src/include/protected/stream_encoder.h
similarity index 100%
rename from third_party/flac/src/include/protected/stream_encoder.h
rename to third_party/FLAC/src/include/protected/stream_encoder.h
diff --git a/third_party/flac/src/include/share/alloc.h b/third_party/FLAC/src/include/share/alloc.h
similarity index 100%
rename from third_party/flac/src/include/share/alloc.h
rename to third_party/FLAC/src/include/share/alloc.h
diff --git a/third_party/flac/src/include/share/compat.h b/third_party/FLAC/src/include/share/compat.h
similarity index 100%
rename from third_party/flac/src/include/share/compat.h
rename to third_party/FLAC/src/include/share/compat.h
diff --git a/third_party/flac/src/include/share/endswap.h b/third_party/FLAC/src/include/share/endswap.h
similarity index 100%
rename from third_party/flac/src/include/share/endswap.h
rename to third_party/FLAC/src/include/share/endswap.h
diff --git a/third_party/flac/src/include/share/getopt.h b/third_party/FLAC/src/include/share/getopt.h
similarity index 100%
rename from third_party/flac/src/include/share/getopt.h
rename to third_party/FLAC/src/include/share/getopt.h
diff --git a/third_party/flac/src/include/share/macros.h b/third_party/FLAC/src/include/share/macros.h
similarity index 100%
rename from third_party/flac/src/include/share/macros.h
rename to third_party/FLAC/src/include/share/macros.h
diff --git a/third_party/flac/src/include/share/private.h b/third_party/FLAC/src/include/share/private.h
similarity index 100%
rename from third_party/flac/src/include/share/private.h
rename to third_party/FLAC/src/include/share/private.h
diff --git a/third_party/flac/src/include/share/safe_str.h b/third_party/FLAC/src/include/share/safe_str.h
similarity index 100%
rename from third_party/flac/src/include/share/safe_str.h
rename to third_party/FLAC/src/include/share/safe_str.h
diff --git a/third_party/flac/src/include/share/utf8.h b/third_party/FLAC/src/include/share/utf8.h
similarity index 100%
rename from third_party/flac/src/include/share/utf8.h
rename to third_party/FLAC/src/include/share/utf8.h
diff --git a/third_party/flac/src/include/share/win_utf8_io.h b/third_party/FLAC/src/include/share/win_utf8_io.h
similarity index 100%
rename from third_party/flac/src/include/share/win_utf8_io.h
rename to third_party/FLAC/src/include/share/win_utf8_io.h
diff --git a/third_party/flac/src/lpc.c b/third_party/FLAC/src/lpc.c
similarity index 100%
rename from third_party/flac/src/lpc.c
rename to third_party/FLAC/src/lpc.c
diff --git a/third_party/flac/src/lpc_intrin_avx2.c b/third_party/FLAC/src/lpc_intrin_avx2.c
similarity index 100%
rename from third_party/flac/src/lpc_intrin_avx2.c
rename to third_party/FLAC/src/lpc_intrin_avx2.c
diff --git a/third_party/flac/src/lpc_intrin_sse.c b/third_party/FLAC/src/lpc_intrin_sse.c
similarity index 100%
rename from third_party/flac/src/lpc_intrin_sse.c
rename to third_party/FLAC/src/lpc_intrin_sse.c
diff --git a/third_party/flac/src/lpc_intrin_sse2.c b/third_party/FLAC/src/lpc_intrin_sse2.c
similarity index 100%
rename from third_party/flac/src/lpc_intrin_sse2.c
rename to third_party/FLAC/src/lpc_intrin_sse2.c
diff --git a/third_party/flac/src/lpc_intrin_sse41.c b/third_party/FLAC/src/lpc_intrin_sse41.c
similarity index 100%
rename from third_party/flac/src/lpc_intrin_sse41.c
rename to third_party/FLAC/src/lpc_intrin_sse41.c
diff --git a/third_party/flac/src/md5.c b/third_party/FLAC/src/md5.c
similarity index 100%
rename from third_party/flac/src/md5.c
rename to third_party/FLAC/src/md5.c
diff --git a/third_party/flac/src/memory.c b/third_party/FLAC/src/memory.c
similarity index 100%
rename from third_party/flac/src/memory.c
rename to third_party/FLAC/src/memory.c
diff --git a/third_party/flac/src/metadata_iterators.c b/third_party/FLAC/src/metadata_iterators.c
similarity index 100%
rename from third_party/flac/src/metadata_iterators.c
rename to third_party/FLAC/src/metadata_iterators.c
diff --git a/third_party/flac/src/metadata_object.c b/third_party/FLAC/src/metadata_object.c
similarity index 100%
rename from third_party/flac/src/metadata_object.c
rename to third_party/FLAC/src/metadata_object.c
diff --git a/third_party/flac/src/ogg_decoder_aspect.c b/third_party/FLAC/src/ogg_decoder_aspect.c
similarity index 100%
rename from third_party/flac/src/ogg_decoder_aspect.c
rename to third_party/FLAC/src/ogg_decoder_aspect.c
diff --git a/third_party/flac/src/ogg_encoder_aspect.c b/third_party/FLAC/src/ogg_encoder_aspect.c
similarity index 100%
rename from third_party/flac/src/ogg_encoder_aspect.c
rename to third_party/FLAC/src/ogg_encoder_aspect.c
diff --git a/third_party/flac/src/ogg_helper.c b/third_party/FLAC/src/ogg_helper.c
similarity index 100%
rename from third_party/flac/src/ogg_helper.c
rename to third_party/FLAC/src/ogg_helper.c
diff --git a/third_party/flac/src/ogg_mapping.c b/third_party/FLAC/src/ogg_mapping.c
similarity index 100%
rename from third_party/flac/src/ogg_mapping.c
rename to third_party/FLAC/src/ogg_mapping.c
diff --git a/third_party/flac/src/stream_decoder.c b/third_party/FLAC/src/stream_decoder.c
similarity index 100%
rename from third_party/flac/src/stream_decoder.c
rename to third_party/FLAC/src/stream_decoder.c
diff --git a/third_party/flac/src/stream_encoder.c b/third_party/FLAC/src/stream_encoder.c
similarity index 100%
rename from third_party/flac/src/stream_encoder.c
rename to third_party/FLAC/src/stream_encoder.c
diff --git a/third_party/flac/src/stream_encoder_framing.c b/third_party/FLAC/src/stream_encoder_framing.c
similarity index 100%
rename from third_party/flac/src/stream_encoder_framing.c
rename to third_party/FLAC/src/stream_encoder_framing.c
diff --git a/third_party/flac/src/stream_encoder_intrin_avx2.c b/third_party/FLAC/src/stream_encoder_intrin_avx2.c
similarity index 100%
rename from third_party/flac/src/stream_encoder_intrin_avx2.c
rename to third_party/FLAC/src/stream_encoder_intrin_avx2.c
diff --git a/third_party/flac/src/stream_encoder_intrin_sse2.c b/third_party/FLAC/src/stream_encoder_intrin_sse2.c
similarity index 100%
rename from third_party/flac/src/stream_encoder_intrin_sse2.c
rename to third_party/FLAC/src/stream_encoder_intrin_sse2.c
diff --git a/third_party/flac/src/stream_encoder_intrin_ssse3.c b/third_party/FLAC/src/stream_encoder_intrin_ssse3.c
similarity index 100%
rename from third_party/flac/src/stream_encoder_intrin_ssse3.c
rename to third_party/FLAC/src/stream_encoder_intrin_ssse3.c
diff --git a/third_party/flac/src/win_utf8_io.c b/third_party/FLAC/src/win_utf8_io.c
similarity index 100%
rename from third_party/flac/src/win_utf8_io.c
rename to third_party/FLAC/src/win_utf8_io.c
diff --git a/third_party/flac/src/window.c b/third_party/FLAC/src/window.c
similarity index 100%
rename from third_party/flac/src/window.c
rename to third_party/FLAC/src/window.c
diff --git a/third_party/flac/stream_decoder.h b/third_party/FLAC/stream_decoder.h
similarity index 100%
rename from third_party/flac/stream_decoder.h
rename to third_party/FLAC/stream_decoder.h
diff --git a/third_party/flac/stream_encoder.h b/third_party/FLAC/stream_encoder.h
similarity index 100%
rename from third_party/flac/stream_encoder.h
rename to third_party/FLAC/stream_encoder.h
diff --git a/third_party/libmodplug/src/snd_dsp.cpp b/third_party/libmodplug/src/snd_dsp.cpp
index 29d142e..3a2969d 100644
--- a/third_party/libmodplug/src/snd_dsp.cpp
+++ b/third_party/libmodplug/src/snd_dsp.cpp
@@ -98,12 +98,10 @@ static LONG DolbyLoFilterDelay[XBASSBUFFERSIZE];
 static LONG DolbyHiFilterBuffer[FILTERBUFFERSIZE];
 static LONG SurroundBuffer[SURROUNDBUFFERSIZE];
 
-/*
 // Access the main temporary mix buffer directly: avoids an extra pointer
-extern int MixSoundBuffer[MIXBUFFERSIZE*2];
+extern int MixSoundBuffer[MIXBUFFERSIZE*4];
 //cextern int MixReverbBuffer[MIXBUFFERSIZE*2];
 extern int MixReverbBuffer[MIXBUFFERSIZE*2];
-*/
 
 static UINT GetMaskFromSize(UINT len)
 //-----------------------------------
diff --git a/third_party/libogg/include/ogg/os_types.h b/third_party/libogg/include/ogg/os_types.h
index 8bf8210..ac72e33 100644
--- a/third_party/libogg/include/ogg/os_types.h
+++ b/third_party/libogg/include/ogg/os_types.h
@@ -140,7 +140,12 @@
 
 #else
 
-#  include <ogg/config_types.h>
+#include <inttypes.h>
+typedef int16_t ogg_int16_t;
+typedef uint16_t ogg_uint16_t;
+typedef int32_t ogg_int32_t;
+typedef uint32_t ogg_uint32_t;
+typedef int64_t ogg_int64_t;
 
 #endif
 
diff --git a/third_party/wavpack/include/Makefile.am b/third_party/wavpack/include/Makefile.am
new file mode 100644
index 0000000..4284506
--- /dev/null
+++ b/third_party/wavpack/include/Makefile.am
@@ -0,0 +1,5 @@
+wpinclude_HEADERS = wavpack.h
+wpincludedir = $(prefix)/include/wavpack
+
+MAINTAINERCLEANFILES = \
+	Makefile.in
diff --git a/third_party/wavpack/include/wavpack.h b/third_party/wavpack/include/wavpack.h
index 885c52f..374924b 100644
--- a/third_party/wavpack/include/wavpack.h
+++ b/third_party/wavpack/include/wavpack.h
@@ -1,7 +1,7 @@
 ////////////////////////////////////////////////////////////////////////////
 //                           **** WAVPACK ****                            //
 //                  Hybrid Lossless Wavefile Compressor                   //
-//              Copyright (c) 1998 - 2006 Conifer Software.               //
+//                Copyright (c) 1998 - 2016 David Bryant.                 //
 //                          All Rights Reserved.                          //
 //      Distributed under the BSD Software License (see license.txt)      //
 ////////////////////////////////////////////////////////////////////////////
@@ -16,10 +16,17 @@
 
 #include <sys/types.h>
 
-#if defined(_WIN32) && !defined(__MINGW32__)
-#include <stdint.h>
+#if defined(_MSC_VER) && _MSC_VER < 1600
+typedef unsigned __int64 uint64_t;
+typedef unsigned __int32 uint32_t;
+typedef unsigned __int16 uint16_t;
+typedef unsigned __int8 uint8_t;
+typedef __int64 int64_t;
+typedef __int32 int32_t;
+typedef __int16 int16_t;
+typedef __int8  int8_t;
 #else
-#include <inttypes.h>
+#include <stdint.h>
 #endif
 
 // RIFF / wav header formats (these occur at the beginning of both wav files
@@ -41,12 +48,12 @@ typedef struct {
 #define ChunkHeaderFormat "4L"
 
 typedef struct {
-    unsigned short FormatTag, NumChannels;
+    uint16_t FormatTag, NumChannels;
     uint32_t SampleRate, BytesPerSecond;
-    unsigned short BlockAlign, BitsPerSample;
-    unsigned short cbSize, ValidBitsPerSample;
+    uint16_t BlockAlign, BitsPerSample;
+    uint16_t cbSize, ValidBitsPerSample;
     int32_t ChannelMask;
-    unsigned short SubFormat;
+    uint16_t SubFormat;
     char GUID [14];
 } WaveHeader;
 
@@ -62,13 +69,43 @@ typedef struct {
 typedef struct {
     char ckID [4];
     uint32_t ckSize;
-    short version;
-    unsigned char track_no, index_no;
+    int16_t version;
+    unsigned char block_index_u8;
+    unsigned char total_samples_u8;
     uint32_t total_samples, block_index, block_samples, flags, crc;
 } WavpackHeader;
 
 #define WavpackHeaderFormat "4LS2LLLLL"
 
+// Macros to access the 40-bit block_index field
+
+#define GET_BLOCK_INDEX(hdr) ( (int64_t) (hdr).block_index + ((int64_t) (hdr).block_index_u8 << 32) )
+
+#define SET_BLOCK_INDEX(hdr,value) do { \
+    int64_t tmp = (value);              \
+    (hdr).block_index = (uint32_t) tmp; \
+    (hdr).block_index_u8 =              \
+        (unsigned char) (tmp >> 32);    \
+} while (0)
+
+// Macros to access the 40-bit total_samples field, which is complicated by the fact that
+// all 1's in the lower 32 bits indicates "unknown" (regardless of upper 8 bits)
+
+#define GET_TOTAL_SAMPLES(hdr) ( ((hdr).total_samples == (uint32_t) -1) ? -1 : \
+    (int64_t) (hdr).total_samples + ((int64_t) (hdr).total_samples_u8 << 32) - (hdr).total_samples_u8 )
+
+#define SET_TOTAL_SAMPLES(hdr,value) do {       \
+    int64_t tmp = (value);                      \
+    if (tmp < 0)                                \
+        (hdr).total_samples = (uint32_t) -1;    \
+    else {                                      \
+        tmp += (tmp / 0xffffffffLL);            \
+        (hdr).total_samples = (uint32_t) tmp;   \
+        (hdr).total_samples_u8 =                \
+            (unsigned char) (tmp >> 32);        \
+    }                                           \
+} while (0)
+
 // or-values for WavpackHeader.flags
 #define BYTES_STORED    3       // 1-4 bytes/sample
 #define MONO_FLAG       4       // not stereo
@@ -95,17 +132,19 @@ typedef struct {
 #define SRATE_MASK      (0xfL << SRATE_LSB)
 
 #define FALSE_STEREO    0x40000000      // block is stereo, but data is mono
-
-#define IGNORED_FLAGS   0x18000000      // reserved, but ignore if encountered
 #define NEW_SHAPING     0x20000000      // use IIR filter for negative shaping
-#define UNKNOWN_FLAGS   0x80000000      // also reserved, but refuse decode if
-                                        //  encountered
 
 #define MONO_DATA (MONO_FLAG | FALSE_STEREO)
 
+// Introduced in WavPack 5.0:
+#define HAS_CHECKSUM    0x10000000      // block contains a trailing checksum
+#define DSD_FLAG        0x80000000      // block is encoded DSD (1-bit PCM)
+
+#define IGNORED_FLAGS   0x08000000      // reserved, but ignore if encountered
+#define UNKNOWN_FLAGS   0x00000000      // we no longer have any of these spares
+
 #define MIN_STREAM_VERS     0x402       // lowest stream version we'll decode
 #define MAX_STREAM_VERS     0x410       // highest stream version we'll decode or encode
-#define CUR_STREAM_VERS     0x407       // stream version we are writing now
 
 // These are the mask bit definitions for the metadata chunk id byte (see format.txt)
 
@@ -131,11 +170,15 @@ typedef struct {
 
 #define ID_RIFF_HEADER          (ID_OPTIONAL_DATA | 0x1)
 #define ID_RIFF_TRAILER         (ID_OPTIONAL_DATA | 0x2)
-#define ID_REPLAY_GAIN          (ID_OPTIONAL_DATA | 0x3)    // never used (APEv2)
-#define ID_CUESHEET             (ID_OPTIONAL_DATA | 0x4)    // never used (APEv2)
+#define ID_ALT_HEADER           (ID_OPTIONAL_DATA | 0x3)
+#define ID_ALT_TRAILER          (ID_OPTIONAL_DATA | 0x4)
 #define ID_CONFIG_BLOCK         (ID_OPTIONAL_DATA | 0x5)
 #define ID_MD5_CHECKSUM         (ID_OPTIONAL_DATA | 0x6)
 #define ID_SAMPLE_RATE          (ID_OPTIONAL_DATA | 0x7)
+#define ID_ALT_EXTENSION        (ID_OPTIONAL_DATA | 0x8)
+#define ID_ALT_MD5_CHECKSUM     (ID_OPTIONAL_DATA | 0x9)
+#define ID_NEW_CONFIG_BLOCK     (ID_OPTIONAL_DATA | 0xa)
+#define ID_BLOCK_CHECKSUM       (ID_OPTIONAL_DATA | 0xf)
 
 ///////////////////////// WavPack Configuration ///////////////////////////////
 
@@ -149,12 +192,13 @@ typedef struct {
     int qmode, flags, xmode, num_channels, float_norm_exp;
     int32_t block_samples, extra_flags, sample_rate, channel_mask;
     unsigned char md5_checksum [16], md5_read;
-    int num_tag_strings;
-    char **tag_strings;
+    int num_tag_strings;                // this field is not used
+    char **tag_strings;                 // this field is not used
 } WavpackConfig;
 
 #define CONFIG_HYBRID_FLAG      8       // hybrid mode
 #define CONFIG_JOINT_STEREO     0x10    // joint stereo
+#define CONFIG_CROSS_DECORR     0x20    // no-delay cross decorrelation
 #define CONFIG_HYBRID_SHAPE     0x40    // noise shape (hybrid mode only)
 #define CONFIG_FAST_FLAG        0x200   // fast mode
 #define CONFIG_HIGH_FLAG        0x800   // high quality mode
@@ -166,6 +210,7 @@ typedef struct {
 #define CONFIG_CREATE_EXE       0x40000 // create executable
 #define CONFIG_CREATE_WVC       0x80000 // create correction file
 #define CONFIG_OPTIMIZE_WVC     0x100000 // maximize bybrid compression
+#define CONFIG_COMPATIBLE_WRITE 0x400000 // write files for decoders < 4.3
 #define CONFIG_CALC_NOISE       0x800000 // calc noise in hybrid mode
 #define CONFIG_EXTRA_MODE       0x2000000 // extra processing mode
 #define CONFIG_SKIP_WVX         0x4000000 // no wvx stream w/ floats & big ints
@@ -174,6 +219,32 @@ typedef struct {
 #define CONFIG_PAIR_UNDEF_CHANS 0x20000000 // encode undefined channels in stereo pairs
 #define CONFIG_OPTIMIZE_MONO    0x80000000 // optimize for mono streams posing as stereo
 
+// The lower 8 bits of qmode indicate the use of new features in version 5 that (presently)
+// only apply to Core Audio Files (CAF) and DSD files, but could apply to other things too.
+// These flags are stored in the file and can be retrieved by a decoder that is aware of
+// them, but the individual bits are meaningless to the library. If ANY of these bits are
+// set then the MD5 sum is written with a new ID so that old decoders will not see it
+// (because these features will cause the MD5 sum to be different and fail).
+
+#define QMODE_BIG_ENDIAN        0x1     // big-endian data format (opposite of WAV format)
+#define QMODE_SIGNED_BYTES      0x2     // 8-bit audio data is signed (opposite of WAV format)
+#define QMODE_UNSIGNED_WORDS    0x4     // audio data (other than 8-bit) is unsigned (opposite of WAV format)
+#define QMODE_REORDERED_CHANS   0x8     // source channels were not Microsoft order, so they were reordered
+#define QMODE_DSD_LSB_FIRST     0x10    // DSD bytes, LSB first (most Sony .dsf files)
+#define QMODE_DSD_MSB_FIRST     0x20    // DSD bytes, MSB first (Philips .dff files)
+#define QMODE_DSD_IN_BLOCKS     0x40    // DSD data is blocked by channels (Sony .dsf only)
+#define QMODE_DSD_AUDIO         (QMODE_DSD_LSB_FIRST | QMODE_DSD_MSB_FIRST)
+
+// The rest of the qmode word is reserved for the private use of the command-line programs
+// and are ignored by the library (and not stored either). They really should not be defined
+// here, but I thought it would be a good idea to have all the definitions together.
+
+#define QMODE_ADOBE_MODE        0x100   // user specified Adobe mode
+#define QMODE_NO_STORE_WRAPPER  0x200   // user specified to not store audio file wrapper (RIFF, CAFF, etc.)
+#define QMODE_CHANS_UNASSIGNED  0x400   // user specified "..." in --channel-order option
+#define QMODE_IGNORE_LENGTH     0x800   // user specified to ignore length in file header
+#define QMODE_RAW_PCM           0x1000  // user specified raw PCM format (no header present)
+
 ////////////// Callbacks used for reading & writing WavPack streams //////////
 
 typedef struct {
@@ -189,18 +260,40 @@ typedef struct {
     int32_t (*write_bytes)(void *id, void *data, int32_t bcount);
 } WavpackStreamReader;
 
+// Extended version of structure for handling large files and added
+// functionality for truncating and closing files
+
+typedef struct {
+    int32_t (*read_bytes)(void *id, void *data, int32_t bcount);
+    int32_t (*write_bytes)(void *id, void *data, int32_t bcount);
+    int64_t (*get_pos)(void *id);                               // new signature for large files
+    int (*set_pos_abs)(void *id, int64_t pos);                  // new signature for large files
+    int (*set_pos_rel)(void *id, int64_t delta, int mode);      // new signature for large files
+    int (*push_back_byte)(void *id, int c);
+    int64_t (*get_length)(void *id);                            // new signature for large files
+    int (*can_seek)(void *id);
+    int (*truncate_here)(void *id);                             // new function to truncate file at current position
+    int (*close)(void *id);                                     // new function to close file
+} WavpackStreamReader64;
+
 typedef int (*WavpackBlockOutput)(void *id, void *data, int32_t bcount);
 
 //////////////////////////// function prototypes /////////////////////////////
 
-// Note: See wputils.c sourcecode for descriptions for using these functions.
-
 typedef void WavpackContext;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+#define MAX_WAVPACK_SAMPLES ((1LL << 40) - 257)
+
+WavpackContext *WavpackOpenRawDecoder (
+    void *main_data, int32_t main_size,
+    void *corr_data, int32_t corr_size,
+    int16_t version, char *error, int flags, int norm_offset);
+
+WavpackContext *WavpackOpenFileInputEx64 (WavpackStreamReader64 *reader, void *wv_id, void *wvc_id, char *error, int flags, int norm_offset);
 WavpackContext *WavpackOpenFileInputEx (WavpackStreamReader *reader, void *wv_id, void *wvc_id, char *error, int flags, int norm_offset);
 WavpackContext *WavpackOpenFileInput (const char *infilename, char *error, int flags, int norm_offset);
 
@@ -212,6 +305,16 @@ WavpackContext *WavpackOpenFileInput (const char *infilename, char *error, int f
 #define OPEN_STREAMING  0x20    // "streaming" mode blindly unpacks blocks
                                 // w/o regard to header file position info
 #define OPEN_EDIT_TAGS  0x40    // allow editing of tags
+#define OPEN_FILE_UTF8  0x80    // assume filenames are UTF-8 encoded, not ANSI (Windows only)
+
+// new for version 5
+
+#define OPEN_DSD_NATIVE 0x100   // open DSD files as bitstreams
+                                // (returned as 8-bit "samples" stored in 32-bit words)
+#define OPEN_DSD_AS_PCM 0x200   // open DSD files as 24-bit PCM (decimated 8x)
+#define OPEN_ALT_TYPES  0x400   // application is aware of alternate file types & qmode
+                                // (just affects retrieving wrappers & MD5 checksums)
+#define OPEN_NO_CHECKSUM 0x800  // don't verify block checksums before decoding
 
 int WavpackGetMode (WavpackContext *wpc);
 
@@ -230,16 +333,25 @@ int WavpackGetMode (WavpackContext *wpc);
 #define MODE_XMODE      0x7000  // mask for extra level (1-6, 0=unknown)
 #define MODE_DNS        0x8000
 
+int WavpackVerifySingleBlock (unsigned char *buffer, int verify_checksum);
+int WavpackGetQualifyMode (WavpackContext *wpc);
 char *WavpackGetErrorMessage (WavpackContext *wpc);
 int WavpackGetVersion (WavpackContext *wpc);
+char *WavpackGetFileExtension (WavpackContext *wpc);
+unsigned char WavpackGetFileFormat (WavpackContext *wpc);
 uint32_t WavpackUnpackSamples (WavpackContext *wpc, int32_t *buffer, uint32_t samples);
 uint32_t WavpackGetNumSamples (WavpackContext *wpc);
+int64_t WavpackGetNumSamples64 (WavpackContext *wpc);
+uint32_t WavpackGetNumSamplesInFrame (WavpackContext *wpc);
 uint32_t WavpackGetSampleIndex (WavpackContext *wpc);
+int64_t WavpackGetSampleIndex64 (WavpackContext *wpc);
 int WavpackGetNumErrors (WavpackContext *wpc);
 int WavpackLossyBlocks (WavpackContext *wpc);
 int WavpackSeekSample (WavpackContext *wpc, uint32_t sample);
+int WavpackSeekSample64 (WavpackContext *wpc, int64_t sample);
 WavpackContext *WavpackCloseFile (WavpackContext *wpc);
 uint32_t WavpackGetSampleRate (WavpackContext *wpc);
+uint32_t WavpackGetNativeSampleRate (WavpackContext *wpc);
 int WavpackGetBitsPerSample (WavpackContext *wpc);
 int WavpackGetBytesPerSample (WavpackContext *wpc);
 int WavpackGetNumChannels (WavpackContext *wpc);
@@ -247,12 +359,15 @@ int WavpackGetChannelMask (WavpackContext *wpc);
 int WavpackGetReducedChannels (WavpackContext *wpc);
 int WavpackGetFloatNormExp (WavpackContext *wpc);
 int WavpackGetMD5Sum (WavpackContext *wpc, unsigned char data [16]);
+void WavpackGetChannelIdentities (WavpackContext *wpc, unsigned char *identities);
+uint32_t WavpackGetChannelLayout (WavpackContext *wpc, unsigned char *reorder);
 uint32_t WavpackGetWrapperBytes (WavpackContext *wpc);
 unsigned char *WavpackGetWrapperData (WavpackContext *wpc);
 void WavpackFreeWrapper (WavpackContext *wpc);
 void WavpackSeekTrailingWrapper (WavpackContext *wpc);
 double WavpackGetProgress (WavpackContext *wpc);
 uint32_t WavpackGetFileSize (WavpackContext *wpc);
+int64_t WavpackGetFileSize64 (WavpackContext *wpc);
 double WavpackGetRatio (WavpackContext *wpc);
 double WavpackGetAverageBitrate (WavpackContext *wpc, int count_wvc);
 double WavpackGetInstantBitrate (WavpackContext *wpc);
@@ -268,7 +383,17 @@ int WavpackDeleteTagItem (WavpackContext *wpc, const char *item);
 int WavpackWriteTag (WavpackContext *wpc);
 
 WavpackContext *WavpackOpenFileOutput (WavpackBlockOutput blockout, void *wv_id, void *wvc_id);
+void WavpackSetFileInformation (WavpackContext *wpc, char *file_extension, unsigned char file_format);
+
+#define WP_FORMAT_WAV   0       // Microsoft RIFF, including BWF and RF64 variants
+#define WP_FORMAT_W64   1       // Sony Wave64
+#define WP_FORMAT_CAF   2       // Apple CoreAudio
+#define WP_FORMAT_DFF   3       // Philips DSDIFF
+#define WP_FORMAT_DSF   4       // Sony DSD Format
+
 int WavpackSetConfiguration (WavpackContext *wpc, WavpackConfig *config, uint32_t total_samples);
+int WavpackSetConfiguration64 (WavpackContext *wpc, WavpackConfig *config, int64_t total_samples, const unsigned char *chan_ids);
+int WavpackSetChannelLayout (WavpackContext *wpc, uint32_t layout_tag, const unsigned char *reorder);
 int WavpackAddWrapper (WavpackContext *wpc, void *data, uint32_t bcount);
 int WavpackStoreMD5Sum (WavpackContext *wpc, unsigned char data [16]);
 int WavpackPackInit (WavpackContext *wpc);
@@ -282,6 +407,8 @@ void WavpackFloatNormalize (int32_t *values, int32_t num_values, int delta_exp);
 
 void WavpackLittleEndianToNative (void *data, char *format);
 void WavpackNativeToLittleEndian (void *data, char *format);
+void WavpackBigEndianToNative (void *data, char *format);
+void WavpackNativeToBigEndian (void *data, char *format);
 
 uint32_t WavpackGetLibraryVersion (void);
 const char *WavpackGetLibraryVersionString (void);
diff --git a/third_party/wavpack/src/bits.c b/third_party/wavpack/src/bits.c
deleted file mode 100644
index 2eab38b..0000000
--- a/third_party/wavpack/src/bits.c
+++ /dev/null
@@ -1,274 +0,0 @@
-////////////////////////////////////////////////////////////////////////////
-//                           **** WAVPACK ****                            //
-//                  Hybrid Lossless Wavefile Compressor                   //
-//              Copyright (c) 1998 - 2006 Conifer Software.               //
-//                          All Rights Reserved.                          //
-//      Distributed under the BSD Software License (see license.txt)      //
-////////////////////////////////////////////////////////////////////////////
-
-// bits.c
-
-// This module provides utilities to support the BitStream structure which is
-// used to read and write all WavPack audio data streams. It also contains a
-// wrapper for the stream I/O functions and a set of functions dealing with
-// endian-ness, both for enhancing portability. Finally, a debug wrapper for
-// the malloc() system is provided.
-
-#include "wavpack_local.h"
-
-#include <stdlib.h>
-#include <string.h>
-#include <ctype.h>
-#include <sys/stat.h>
-
-#if defined(WIN32)
-#include <io.h>
-#else
-#if defined(__OS2__)
-#include <io.h>
-#endif
-#include <unistd.h>
-#endif
-
-////////////////////////// Bitstream functions ////////////////////////////////
-
-#if !defined(NO_UNPACK) || defined(INFO_ONLY)
-
-// Open the specified BitStream and associate with the specified buffer.
-
-static void bs_read (Bitstream *bs);
-
-void bs_open_read (Bitstream *bs, void *buffer_start, void *buffer_end)
-{
-    bs->error = bs->sr = bs->bc = 0;
-    bs->ptr = (bs->buf = buffer_start) - 1;
-    bs->end = buffer_end;
-    bs->wrap = bs_read;
-}
-
-// This function is only called from the getbit() and getbits() macros when
-// the BitStream has been exhausted and more data is required. Sinve these
-// bistreams no longer access files, this function simple sets an error and
-// resets the buffer.
-
-static void bs_read (Bitstream *bs)
-{
-    bs->ptr = bs->buf - 1;
-    bs->error = 1;
-}
-
-// This function is called to close the bitstream. It returns the number of
-// full bytes actually read as bits.
-
-uint32_t bs_close_read (Bitstream *bs)
-{
-    uint32_t bytes_read;
-
-    if (bs->bc < sizeof (*(bs->ptr)) * 8)
-        bs->ptr++;
-
-    bytes_read = (uint32_t)(bs->ptr - bs->buf) * sizeof (*(bs->ptr));
-
-    if (!(bytes_read & 1))
-        ++bytes_read;
-
-    CLEAR (*bs);
-    return bytes_read;
-}
-
-#endif
-
-#ifndef NO_PACK
-
-// Open the specified BitStream using the specified buffer pointers. It is
-// assumed that enough buffer space has been allocated for all data that will
-// be written, otherwise an error will be generated.
-
-static void bs_write (Bitstream *bs);
-
-void bs_open_write (Bitstream *bs, void *buffer_start, void *buffer_end)
-{
-    bs->error = bs->sr = bs->bc = 0;
-    bs->ptr = bs->buf = buffer_start;
-    bs->end = buffer_end;
-    bs->wrap = bs_write;
-}
-
-// This function is only called from the putbit() and putbits() macros when
-// the buffer is full, which is now flagged as an error.
-
-static void bs_write (Bitstream *bs)
-{
-    bs->ptr = bs->buf;
-    bs->error = 1;
-}
-
-// This function forces a flushing write of the specified BitStream, and
-// returns the total number of bytes written into the buffer.
-
-uint32_t bs_close_write (Bitstream *bs)
-{
-    uint32_t bytes_written;
-
-    if (bs->error)
-        return (uint32_t) -1;
-
-    while (1) {
-        while (bs->bc)
-            putbit_1 (bs);
-
-        bytes_written = (uint32_t)(bs->ptr - bs->buf) * sizeof (*(bs->ptr));
-
-        if (bytes_written & 1) {
-            putbit_1 (bs);
-        }
-        else
-            break;
-    };
-
-    CLEAR (*bs);
-    return bytes_written;
-}
-
-#endif
-
-/////////////////////// Endian Correction Routines ////////////////////////////
-
-void little_endian_to_native (void *data, char *format)
-{
-    unsigned char *cp = (unsigned char *) data;
-    int32_t temp;
-
-    while (*format) {
-        switch (*format) {
-            case 'L':
-                temp = cp [0] + ((int32_t) cp [1] << 8) + ((int32_t) cp [2] << 16) + ((int32_t) cp [3] << 24);
-                * (int32_t *) cp = temp;
-                cp += 4;
-                break;
-
-            case 'S':
-                temp = cp [0] + (cp [1] << 8);
-                * (short *) cp = (short) temp;
-                cp += 2;
-                break;
-
-            default:
-                if (isdigit (*format))
-                    cp += *format - '0';
-
-                break;
-        }
-
-        format++;
-    }
-}
-
-void native_to_little_endian (void *data, char *format)
-{
-    unsigned char *cp = (unsigned char *) data;
-    int32_t temp;
-
-    while (*format) {
-        switch (*format) {
-            case 'L':
-                temp = * (int32_t *) cp;
-                *cp++ = (unsigned char) temp;
-                *cp++ = (unsigned char) (temp >> 8);
-                *cp++ = (unsigned char) (temp >> 16);
-                *cp++ = (unsigned char) (temp >> 24);
-                break;
-
-            case 'S':
-                temp = * (short *) cp;
-                *cp++ = (unsigned char) temp;
-                *cp++ = (unsigned char) (temp >> 8);
-                break;
-
-            default:
-                if (isdigit (*format))
-                    cp += *format - '0';
-
-                break;
-        }
-
-        format++;
-    }
-}
-
-////////////////////////// Debug Wrapper for Malloc ///////////////////////////
-
-#ifdef DEBUG_ALLOC
-
-void *vptrs [512];
-
-static void *add_ptr (void *ptr)
-{
-    int i;
-
-    for (i = 0; i < 512; ++i)
-        if (!vptrs [i]) {
-            vptrs [i] = ptr;
-            break;
-        }
-
-    if (i == 512)
-        error_line ("too many mallocs!");
-
-    return ptr;
-}
-
-static void *del_ptr (void *ptr)
-{
-    int i;
-
-    for (i = 0; i < 512; ++i)
-        if (vptrs [i] == ptr) {
-            vptrs [i] = NULL;
-            break;
-        }
-
-    if (i == 512)
-        error_line ("free invalid ptr!");
-
-    return ptr;
-}
-
-void *malloc_db (uint32_t size)
-{
-    if (size)
-        return add_ptr (malloc (size));
-    else
-        return NULL;
-}
-
-void free_db (void *ptr)
-{
-    if (ptr)
-        free (del_ptr (ptr));
-}
-
-void *realloc_db (void *ptr, uint32_t size)
-{
-    if (ptr && size)
-        return add_ptr (realloc (del_ptr (ptr), size));
-    else if (size)
-        return malloc_db (size);
-    else
-        free_db (ptr);
-
-    return NULL;
-}
-
-int32_t dump_alloc (void)
-{
-    int i, j;
-
-    for (j = i = 0; i < 512; ++i)
-        if (vptrs [i])
-            j++;
-
-    return j;
-}
-
-#endif
diff --git a/third_party/wavpack/src/common_utils.c b/third_party/wavpack/src/common_utils.c
new file mode 100644
index 0000000..c53db91
--- /dev/null
+++ b/third_party/wavpack/src/common_utils.c
@@ -0,0 +1,771 @@
+////////////////////////////////////////////////////////////////////////////
+//                           **** WAVPACK ****                            //
+//                  Hybrid Lossless Wavefile Compressor                   //
+//              Copyright (c) 1998 - 2013 Conifer Software.               //
+//                          All Rights Reserved.                          //
+//      Distributed under the BSD Software License (see license.txt)      //
+////////////////////////////////////////////////////////////////////////////
+
+// common_utils.c
+
+// This module provides a lot of the trivial WavPack API functions and several
+// functions that are common to both reading and writing WavPack files (like
+// WavpackCloseFile()). Functions here are restricted to those that have few
+// external dependencies and this is done so that applications that statically
+// link to the WavPack library (like the command-line utilities on Windows)
+// do not need to include the entire library image if they only use a subset
+// of it. This module will be loaded for ANY WavPack application.
+
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "wavpack_local.h"
+
+#ifndef LIBWAVPACK_VERSION_STRING
+#include "wavpack_version.h"
+#endif
+
+///////////////////////////// local table storage ////////////////////////////
+
+const uint32_t sample_rates [] = { 6000, 8000, 9600, 11025, 12000, 16000, 22050,    // 15 rates, in ascending order
+    24000, 32000, 44100, 48000, 64000, 88200, 96000, 192000 };
+
+///////////////////////////// executable code ////////////////////////////////
+
+// This function obtains general information about an open input file and
+// returns a mask with the following bit values:
+
+// MODE_WVC:  a .wvc file has been found and will be used for lossless
+// MODE_LOSSLESS:  file is lossless (either pure or hybrid)
+// MODE_HYBRID:  file is hybrid mode (either lossy or lossless)
+// MODE_FLOAT:  audio data is 32-bit ieee floating point
+// MODE_VALID_TAG:  file contains a valid ID3v1 or APEv2 tag
+// MODE_HIGH:  file was created in "high" mode (information only)
+// MODE_FAST:  file was created in "fast" mode (information only)
+// MODE_EXTRA:  file was created using "extra" mode (information only)
+// MODE_APETAG:  file contains a valid APEv2 tag
+// MODE_SFX:  file was created as a "self-extracting" executable
+// MODE_VERY_HIGH:  file was created in the "very high" mode (or in
+//                  the "high" mode prior to 4.4)
+// MODE_MD5:  file contains an MD5 checksum
+// MODE_XMODE:  level used for extra mode (1-6, 0=unknown)
+// MODE_DNS:  dynamic noise shaping
+
+int WavpackGetMode (WavpackContext *wpc)
+{
+    int result = 0;
+
+    if (!wpc)                           // no context: report an empty mask
+        return result;
+
+    if (wpc->config.flags & CONFIG_HYBRID_FLAG)
+        result |= MODE_HYBRID;
+    else if (!(wpc->config.flags & CONFIG_LOSSY_MODE))
+        result |= MODE_LOSSLESS;
+
+    if (wpc->wvc_flag)                  // correction file in use
+        result |= (MODE_LOSSLESS | MODE_WVC);
+
+    if (wpc->lossy_blocks)              // overrides any lossless indication set above
+        result &= ~MODE_LOSSLESS;
+
+    if (wpc->config.flags & CONFIG_FLOAT_DATA)
+        result |= MODE_FLOAT;
+
+    // either flag reports "high"; "very high" also covers "high" in stream versions below 0x405
+    if (wpc->config.flags & (CONFIG_HIGH_FLAG | CONFIG_VERY_HIGH_FLAG)) {
+        result |= MODE_HIGH;
+
+        if ((wpc->config.flags & CONFIG_VERY_HIGH_FLAG) ||
+            (wpc->streams && wpc->streams [0] && wpc->streams [0]->wphdr.version < 0x405))
+                result |= MODE_VERY_HIGH;
+    }
+
+    if (wpc->config.flags & CONFIG_FAST_FLAG)
+        result |= MODE_FAST;
+
+    if (wpc->config.flags & CONFIG_EXTRA_MODE)      // xmode level goes in bits 12 and up
+        result |= (MODE_EXTRA | (wpc->config.xmode << 12));
+
+    if (wpc->config.flags & CONFIG_CREATE_EXE)
+        result |= MODE_SFX;
+
+    if (wpc->config.flags & CONFIG_MD5_CHECKSUM)
+        result |= MODE_MD5;
+
+    if ((wpc->config.flags & CONFIG_HYBRID_FLAG) && (wpc->config.flags & CONFIG_DYNAMIC_SHAPING) &&
+        wpc->streams && wpc->streams [0] && wpc->streams [0]->wphdr.version >= 0x407)
+            result |= MODE_DNS;
+
+#ifndef NO_TAGS
+    if (valid_tag (&wpc->m_tag)) {
+        result |= MODE_VALID_TAG;
+
+        if (valid_tag (&wpc->m_tag) == 'A')         // 'A' identifies an APEv2 tag
+            result |= MODE_APETAG;
+    }
+#endif
+
+    return result | ((wpc->config.qmode << 16) & 0xFF0000);     // qmode occupies bits 16-23
+}
+
+// This function obtains information about specific file features that were
+// added for version 5.0, specifically qualifications added to support CAF
+// and DSD files. Except for indicating the presence of DSD data, these
+// bits are meant to simply indicate the format of the data in the original
+// source file and do NOT indicate how the library will return the data to
+// the application (which is always the same). This means that in general an
+// application that simply wants to play or process the audio data need not
+// be concerned about these. If the file is DSD audio, then either of the
+// QMODE_DSD_LSB_FIRST or QMODE_DSD_MSB_FIRST bits will be set (but the
+// DSD audio is always returned to the caller MSB first).
+
+// QMODE_BIG_ENDIAN        0x1     // big-endian data format (opposite of WAV format)
+// QMODE_SIGNED_BYTES      0x2     // 8-bit audio data is signed (opposite of WAV format)
+// QMODE_UNSIGNED_WORDS    0x4     // audio data (other than 8-bit) is unsigned (opposite of WAV format)
+// QMODE_REORDERED_CHANS   0x8     // source channels were not Microsoft order, so they were reordered
+// QMODE_DSD_LSB_FIRST     0x10    // DSD bytes, LSB first (most Sony .dsf files)
+// QMODE_DSD_MSB_FIRST     0x20    // DSD bytes, MSB first (Philips .dff files)
+// QMODE_DSD_IN_BLOCKS     0x40    // DSD data is blocked by channels (Sony .dsf only)
+
+int WavpackGetQualifyMode (WavpackContext *wpc)
+{
+    return wpc->config.qmode & 0xFF;    // low 8 bits of qmode; wpc is dereferenced without a NULL check
+}
+
+// This function returns a pointer to a string describing the last error
+// generated by WavPack.
+
+char *WavpackGetErrorMessage (WavpackContext *wpc)
+{
+    return wpc->error_message;          // the context's own message storage, not a copy; wpc is not NULL-checked
+}
+
+// Get total number of samples contained in the WavPack file, or -1 if unknown
+
+uint32_t WavpackGetNumSamples (WavpackContext *wpc)
+{
+    return (uint32_t) WavpackGetNumSamples64 (wpc);     // 64-bit count truncated to 32 bits, so -1 becomes 0xFFFFFFFF
+}
+
+int64_t WavpackGetNumSamples64 (WavpackContext *wpc)
+{
+    return wpc ? wpc->total_samples : -1;   // total_samples as stored; -1 for a NULL context
+}
+
+// Get the current sample index position, or -1 if unknown
+
+uint32_t WavpackGetSampleIndex (WavpackContext *wpc)
+{
+    return (uint32_t) WavpackGetSampleIndex64 (wpc);    // 64-bit index truncated to 32 bits, so -1 becomes 0xFFFFFFFF
+}
+
+int64_t WavpackGetSampleIndex64 (WavpackContext *wpc)
+{
+    if (!wpc)
+        return -1;
+
+#ifdef ENABLE_LEGACY
+    if (wpc->stream3)                   // legacy stream present: defer to get_sample_index3()
+        return get_sample_index3 (wpc);
+#endif
+
+    // otherwise the position is read from the first stream, when there is one
+    if (wpc->streams && wpc->streams [0])
+        return wpc->streams [0]->sample_index;
+
+    return -1;
+}
+
+// Get the number of errors encountered so far
+
+int WavpackGetNumErrors (WavpackContext *wpc)
+{
+    return wpc ? wpc->crc_errors : 0;   // crc_errors as stored; 0 for a NULL context
+}
+
+// return TRUE if any uncorrected lossy blocks were actually written or read
+
+int WavpackLossyBlocks (WavpackContext *wpc)
+{
+    return wpc ? wpc->lossy_blocks : 0; // lossy_blocks as stored; 0 for a NULL context
+}
+
+// Calculate the progress through the file as a double from 0.0 (for begin)
+// to 1.0 (for done). A return value of -1.0 indicates that the progress is
+// unknown.
+
+double WavpackGetProgress (WavpackContext *wpc)
+{
+    int64_t index = WavpackGetSampleIndex64 (wpc);      // -1 for a NULL context or when no position is available
+    if (index == -1 || wpc->total_samples == -1 || wpc->total_samples == 0)
+        return -1.0;                                    // unknown position or length (also avoids dividing by zero)
+    return (double) index / wpc->total_samples;
+}
+
+// Return the total size of the WavPack file(s) in bytes.
+
+uint32_t WavpackGetFileSize (WavpackContext *wpc)
+{
+    return (uint32_t) (wpc ? wpc->filelen + wpc->file2len : 0);     // filelen + file2len, truncated to 32 bits; 0 for NULL
+}
+
+int64_t WavpackGetFileSize64 (WavpackContext *wpc)
+{
+    return wpc ? wpc->filelen + wpc->file2len : 0;      // filelen + file2len; 0 for a NULL context
+}
+
+// Calculate the ratio of the specified WavPack file size to the size of the
+// original audio data as a double greater than 0.0 and (usually) smaller than
+// 1.0. A value greater than 1.0 represents "negative" compression and a
+// return value of 0.0 indicates that the ratio cannot be determined.
+
+double WavpackGetRatio (WavpackContext *wpc)
+{
+    double unpacked_bytes, packed_bytes;
+
+    if (!wpc || wpc->total_samples == -1 || !wpc->filelen)
+        return 0.0;
+
+    // size of the original audio: samples * channels * bytes per sample
+    unpacked_bytes = (double) wpc->total_samples * wpc->config.num_channels * wpc->config.bytes_per_sample;
+    packed_bytes = (double) wpc->filelen + wpc->file2len;
+
+    return (unpacked_bytes >= 1.0 && packed_bytes >= 1.0) ? packed_bytes / unpacked_bytes : 0.0;
+}
+
+// Calculate the average bitrate of the WavPack file in bits per second. A
+// return of 0.0 indicates that the bitrate cannot be determined. An option is
+// provided to use (or not use) any attendant .wvc file.
+
+double WavpackGetAverageBitrate (WavpackContext *wpc, int count_wvc)
+{
+    double seconds, bytes;
+
+    if (!wpc || wpc->total_samples == -1 || !wpc->filelen)
+        return 0.0;
+
+    seconds = (double) wpc->total_samples / WavpackGetSampleRate (wpc);
+    bytes = (double) wpc->filelen + (count_wvc ? wpc->file2len : 0);    // file2len is counted only on request
+
+    return (seconds >= 0.1 && bytes >= 1.0) ? bytes * 8.0 / seconds : 0.0;
+}
+
+// Calculate the bitrate of the current WavPack file block in bits per second.
+// This can be used for an "instant" bit display and gets updated from about
+// 1 to 4 times per second. A return of 0.0 indicates that the bitrate cannot
+// be determined.
+
+double WavpackGetInstantBitrate (WavpackContext *wpc)
+{
+    double block_seconds, block_bytes = 0;
+    int n;
+
+    if (wpc && wpc->stream3)                // this path falls back to the whole-file average
+        return WavpackGetAverageBitrate (wpc, TRUE);
+
+    if (!wpc || !wpc->streams || !wpc->streams [0] || !wpc->streams [0]->wphdr.block_samples)
+        return 0.0;
+
+    block_seconds = (double) wpc->streams [0]->wphdr.block_samples / WavpackGetSampleRate (wpc);
+
+    for (n = 0; n < wpc->num_streams; ++n) {
+        if (wpc->streams [n]->blockbuff)
+            block_bytes += ((WavpackHeader *) wpc->streams [n]->blockbuff)->ckSize;
+
+        if (wpc->streams [n]->block2buff)
+            block_bytes += ((WavpackHeader *) wpc->streams [n]->block2buff)->ckSize;
+    }
+
+    // the summed ckSize values are scaled by 8 to report bits per second
+    return (block_seconds > 0.0 && block_bytes >= 1.0) ? block_bytes * 8.0 / block_seconds : 0.0;
+}
+
+// This function allows retrieving the Core Audio File channel layout, many of which do not
+// conform to the Microsoft ordering standard that WavPack requires internally (at least for
+// those channels present in the "channel mask"). In addition to the layout tag, this function
+// returns the reordering string (if stored in the file) to allow the unpacker to reorder the
+// channels back to the specified layout (if it wants to restore the CAF order). The number of
+// channels in the layout is determined from the low byte of the layout word (and should
+// probably match the number of channels in the file), and if a reorder string is requested
+// then that much space must be allocated. Note that all the reordering is actually done
+// outside of this library, and that if reordering is done then the appropriate qmode bit
+// will be set.
+//
+// Note: Normally this function would not be used by an application unless it specifically
+// wanted to restore a non-standard channel order (to check an MD5, for example) or obtain
+// the Core Audio channel layout ID. For simple file decoding for playback, the channel_mask
+// should provide all the information required unless there are non-Microsoft channels
+// involved, in which case WavpackGetChannelIdentities() will provide the identities of
+// the other channels (if they are known).
+
+uint32_t WavpackGetChannelLayout (WavpackContext *wpc, unsigned char *reorder)
+{
+    if (reorder && wpc->channel_reordering && (wpc->channel_layout & 0xff) != 0)    // all three are required
+        memcpy (reorder, wpc->channel_reordering, wpc->channel_layout & 0xff);      // low byte = bytes copied
+
+    return wpc->channel_layout;
+}
+
+// This function provides the identities of ALL the channels in the file, including the
+// standard Microsoft channels (which come first, in order, and are numbered 1-18) and also
+// any non-Microsoft channels (which can be in any order and have values from 33-254). The
+// value 0x00 is invalid and 0xFF indicates an "unknown" or "unassigned" channel. The
+// string is NULL terminated so the caller must supply enough space for the number
+// of channels indicated by WavpackGetNumChannels(), plus one.
+//
+// Note that this function returns the actual order of the channels in the Wavpack file
+// (i.e., the order returned by WavpackUnpackSamples()). If the file includes a "reordering"
+// string because the source file was not in Microsoft order that is NOT taken into account
+// here and really only needs to be considered if doing an MD5 verification or if it's
+// required to restore the original order/file (like wvunpack does).
+
+void WavpackGetChannelIdentities (WavpackContext *wpc, unsigned char *identities)
+{
+    int remaining = wpc->config.num_channels, mask_id = 1;
+    uint32_t mask = wpc->config.channel_mask;
+    unsigned char *stored = wpc->channel_identities;
+
+    // one output byte per channel: set mask bits first, then stored identities, then 0xff
+    for (; remaining--; ++identities) {
+        if (mask) {
+            // skip clear bits; a set bit's position (counted from 1) becomes the identity
+            for (; !(mask & 1); mask >>= 1)
+                mask_id++;
+
+            *identities = mask_id++;
+            mask >>= 1;
+        }
+        else if (stored && *stored)
+            *identities = *stored++;
+        else
+            *identities = 0xff;
+    }
+
+    *identities = 0;        // terminating zero, so the caller's buffer needs num_channels + 1 bytes
+}
+
+// For local use only. Install a callback to be executed when WavpackCloseFile() is called,
+// usually used to dump some statistics accumulated during encode or decode.
+
+void install_close_callback (WavpackContext *wpc, void cb_func (void *wpc))
+{
+    wpc->close_callback = cb_func;      // only stored here; WavpackCloseFile() invokes it before releasing anything
+}
+
+// Close the specified WavPack file and release all resources used by it.
+// Returns NULL.
+
+WavpackContext *WavpackCloseFile (WavpackContext *wpc)
+{
+    if (!wpc)                           // tolerate NULL, as WavpackFreeWrapper() and the accessors do
+        return NULL;
+
+    if (wpc->close_callback)            // give the installed hook a look at the still-intact context
+        wpc->close_callback (wpc);
+
+    if (wpc->streams) {
+        free_streams (wpc);             // releases per-stream buffers and every stream except [0]
+        free (wpc->streams [0]);        // free (NULL) is a no-op, so no separate check is needed
+        free (wpc->streams);
+    }
+
+#ifdef ENABLE_LEGACY
+    if (wpc->stream3)
+        free_stream3 (wpc);
+#endif
+
+    if (wpc->reader && wpc->reader->close && wpc->wv_in)
+        wpc->reader->close (wpc->wv_in);
+
+    if (wpc->reader && wpc->reader->close && wpc->wvc_in)
+        wpc->reader->close (wpc->wvc_in);
+
+    WavpackFreeWrapper (wpc);
+
+    if (wpc->channel_reordering)
+        free (wpc->channel_reordering);
+
+#ifndef NO_TAGS
+    free_tag (&wpc->m_tag);
+#endif
+
+#ifdef ENABLE_DSD
+    if (wpc->decimation_context)
+        decimate_dsd_destroy (wpc->decimation_context);
+#endif
+
+    free (wpc);
+
+    return NULL;
+}
+
+// These routines are used to access (and free) header and trailer data that
+// was retrieved from the Wavpack file. The header will be available before
+// the samples are decoded and the trailer will be available after all samples
+// have been read.
+
+uint32_t WavpackGetWrapperBytes (WavpackContext *wpc)
+{
+    return wpc ? wpc->wrapper_bytes : 0;        // wrapper_bytes as stored; 0 for a NULL context
+}
+
+unsigned char *WavpackGetWrapperData (WavpackContext *wpc)
+{
+    return wpc ? wpc->wrapper_data : NULL;      // the context's own pointer, not a copy; NULL for a NULL context
+}
+
+void WavpackFreeWrapper (WavpackContext *wpc)
+{
+    if (!wpc || !wpc->wrapper_data)
+        return;                         // no context or nothing stored
+    free (wpc->wrapper_data);
+    wpc->wrapper_data = NULL;           // reset both fields so a repeated call does nothing
+    wpc->wrapper_bytes = 0;
+}
+
+// Returns the sample rate of the specified WavPack file
+
+uint32_t WavpackGetSampleRate (WavpackContext *wpc)
+{
+    return wpc ? (wpc->dsd_multiplier ? wpc->config.sample_rate * wpc->dsd_multiplier : wpc->config.sample_rate) : 44100;  // scaled by dsd_multiplier when nonzero; 44100 for NULL
+}
+
+// Returns the native sample rate of the specified WavPack file
+// (provides the native rate for DSD files rather than the "byte" rate that's used for
+//   seeking, duration, etc. and would generally be used just for user facing reports)
+
+uint32_t WavpackGetNativeSampleRate (WavpackContext *wpc)
+{
+    return wpc ? (wpc->dsd_multiplier ? wpc->config.sample_rate * wpc->dsd_multiplier * 8 : wpc->config.sample_rate) : 44100;  // nonzero dsd_multiplier: rate * multiplier * 8; 44100 for NULL
+}
+
+// Returns the number of channels of the specified WavPack file. Note that
+// this is the actual number of channels contained in the file even if the
+// OPEN_2CH_MAX flag was specified when the file was opened.
+
+int WavpackGetNumChannels (WavpackContext *wpc)
+{
+    return wpc ? wpc->config.num_channels : 2;      // defaults to 2 for a NULL context
+}
+
+// Returns the standard Microsoft channel mask for the specified WavPack
+// file. A value of zero indicates that there is no speaker assignment
+// information.
+
+int WavpackGetChannelMask (WavpackContext *wpc)
+{
+    return wpc ? wpc->config.channel_mask : 0;      // defaults to 0 for a NULL context
+}
+
+// Return the normalization value for floating point data (valid only
+// if floating point data is present). A value of 127 indicates that
+// the floating point range is +/- 1.0. Higher values indicate a
+// larger floating point range.
+
+int WavpackGetFloatNormExp (WavpackContext *wpc)
+{
+    return wpc->config.float_norm_exp;      // wpc is dereferenced without a NULL check
+}
+
+// Returns the actual number of valid bits per sample contained in the
+// original file, which may or may not be a multiple of 8. Floating data
+// always has 32 bits, integers may be from 1 to 32 bits each. When this
+// value is not a multiple of 8, then the "extra" bits are located in the
+// LSBs of the results. That is, values are right justified when unpacked
+// into ints, but are left justified in the number of bytes used by the
+// original data.
+
+int WavpackGetBitsPerSample (WavpackContext *wpc)
+{
+    return wpc ? wpc->config.bits_per_sample : 16;      // defaults to 16 for a NULL context
+}
+
+// Returns the number of bytes used for each sample (1 to 4) in the original
+// file. This is required information for the user of this module because the
+// audio data is returned in the LOWER bytes of the long buffer and must be
+// left-shifted 8, 16, or 24 bits if normalized longs are required.
+
+int WavpackGetBytesPerSample (WavpackContext *wpc)
+{
+    return wpc ? wpc->config.bytes_per_sample : 2;      // defaults to 2 for a NULL context
+}
+
+// If the OPEN_2CH_MAX flag is specified when opening the file, this function
+// will return the actual number of channels decoded from the file (which may
+// or may not be less than the actual number of channels, but will always be
+// 1 or 2). Normally, this will be the front left and right channels of a
+// multichannel file.
+
+int WavpackGetReducedChannels (WavpackContext *wpc)
+{
+    if (!wpc)
+        return 2;               // default when no context is given
+
+    return wpc->reduced_channels ? wpc->reduced_channels : wpc->config.num_channels;    // 0 means "not reduced"
+}
+
+// Free all memory allocated for raw WavPack blocks (for all allocated streams)
+// and free all additional streams. This does not free the default stream ([0])
+// which is always kept around.
+
+void free_streams (WavpackContext *wpc)
+{
+    int si = wpc->num_streams;      // local countdown; num_streams itself is decremented below
+
+    while (si--) {                  // walks from the last stream down to stream 0
+        if (wpc->streams [si]->blockbuff) {
+            free (wpc->streams [si]->blockbuff);
+            wpc->streams [si]->blockbuff = NULL;
+        }
+
+        if (wpc->streams [si]->block2buff) {
+            free (wpc->streams [si]->block2buff);
+            wpc->streams [si]->block2buff = NULL;
+        }
+
+        if (wpc->streams [si]->sample_buffer) {
+            free (wpc->streams [si]->sample_buffer);
+            wpc->streams [si]->sample_buffer = NULL;
+        }
+
+        if (wpc->streams [si]->dc.shaping_data) {
+            free (wpc->streams [si]->dc.shaping_data);
+            wpc->streams [si]->dc.shaping_data = NULL;
+        }
+
+#ifdef ENABLE_DSD
+        if (wpc->streams [si]->dsd.probabilities) {
+            free (wpc->streams [si]->dsd.probabilities);
+            wpc->streams [si]->dsd.probabilities = NULL;
+        }
+
+        if (wpc->streams [si]->dsd.summed_probabilities) {
+            free (wpc->streams [si]->dsd.summed_probabilities);
+            wpc->streams [si]->dsd.summed_probabilities = NULL;
+        }
+
+        if (wpc->streams [si]->dsd.value_lookup) {
+            int i;
+
+            for (i = 0; i < wpc->streams [si]->dsd.history_bins; ++i)   // free each row before the row table itself
+                if (wpc->streams [si]->dsd.value_lookup [i])
+                    free (wpc->streams [si]->dsd.value_lookup [i]);
+
+            free (wpc->streams [si]->dsd.value_lookup);
+            wpc->streams [si]->dsd.value_lookup = NULL;
+        }
+
+        if (wpc->streams [si]->dsd.ptable) {
+            free (wpc->streams [si]->dsd.ptable);
+            wpc->streams [si]->dsd.ptable = NULL;
+        }
+#endif
+
+        if (si) {                   // every stream except [0] is itself freed and its slot cleared
+            wpc->num_streams--;
+            free (wpc->streams [si]);
+            wpc->streams [si] = NULL;
+        }
+    }
+
+    wpc->current_stream = 0;        // only stream [0] can remain, so point back at it
+}
+
+void WavpackFloatNormalize (int32_t *values, int32_t num_values, int delta_exp)
+{
+    f32 *fp = (f32 *) values;
+
+    if (delta_exp == 0)         // nothing to adjust
+        return;
+
+    // shift every exponent by delta_exp: results <= 0 become zero, results >= 255 get exponent 255, mantissa 0
+    for (; num_values--; ++fp) {
+        int old_exp = get_exponent (*fp), new_exp = old_exp + delta_exp;
+
+        if (old_exp == 0 || new_exp <= 0)
+            *fp = 0;
+        else if (old_exp == 255 || new_exp >= 255) {
+            set_exponent (*fp, 255);
+            set_mantissa (*fp, 0);
+        }
+        else
+            set_exponent (*fp, new_exp);
+    }
+}
+
+void WavpackLittleEndianToNative (void *data, char *format)
+{
+    unsigned char *cp = (unsigned char *) data;
+    uint64_t temp;      // unsigned, so shifting a byte >= 0x80 into the top bit is well defined
+
+    while (*format) {   // one format character per field: 'D' = 64-bit, 'L' = 32-bit, 'S' = 16-bit
+        switch (*format) {
+            case 'D':
+                temp = cp [0] + ((uint64_t) cp [1] << 8) + ((uint64_t) cp [2] << 16) + ((uint64_t) cp [3] << 24) +
+                    ((uint64_t) cp [4] << 32) + ((uint64_t) cp [5] << 40) + ((uint64_t) cp [6] << 48) + ((uint64_t) cp [7] << 56);
+                * (int64_t *) cp = (int64_t) temp;
+                cp += 8;
+                break;
+
+            case 'L':
+                temp = cp [0] + ((uint32_t) cp [1] << 8) + ((uint32_t) cp [2] << 16) + ((uint32_t) cp [3] << 24);
+                * (int32_t *) cp = (int32_t) temp;
+                cp += 4;
+                break;
+
+            case 'S':
+                temp = cp [0] + (cp [1] << 8);
+                * (int16_t *) cp = (int16_t) temp;
+                cp += 2;
+                break;
+
+            default:    // a digit N skips N bytes; any other character is ignored
+                if (isdigit ((unsigned char) *format))
+                    cp += *format - '0';
+
+                break;
+        }
+
+        format++;
+    }
+}
+
+void WavpackNativeToLittleEndian (void *data, char *format)
+{
+    unsigned char *cp = (unsigned char *) data;
+    int64_t temp;
+
+    for (; *format; ++format)       // 'D', 'L', 'S' rewrite a field in place; a digit N skips N bytes
+        switch (*format) {
+            case 'D':
+                temp = * (int64_t *) cp;
+                cp [0] = (unsigned char) temp;
+                cp [1] = (unsigned char) (temp >> 8);
+                cp [2] = (unsigned char) (temp >> 16);
+                cp [3] = (unsigned char) (temp >> 24);
+                cp [4] = (unsigned char) (temp >> 32);
+                cp [5] = (unsigned char) (temp >> 40);
+                cp [6] = (unsigned char) (temp >> 48);
+                cp [7] = (unsigned char) (temp >> 56);
+                cp += 8;
+                break;
+
+            case 'L':
+                temp = * (int32_t *) cp;
+                cp [0] = (unsigned char) temp;
+                cp [1] = (unsigned char) (temp >> 8);
+                cp [2] = (unsigned char) (temp >> 16);
+                cp [3] = (unsigned char) (temp >> 24);
+                cp += 4;
+                break;
+
+            case 'S':
+                temp = * (int16_t *) cp;
+                cp [0] = (unsigned char) temp;
+                cp [1] = (unsigned char) (temp >> 8);
+                cp += 2;
+                break;
+
+            default:
+                if (isdigit (*format))
+                    cp += *format - '0';
+
+                break;
+        }
+}
+
+void WavpackBigEndianToNative (void *data, char *format)
+{
+    unsigned char *cp = (unsigned char *) data;
+    uint64_t temp;      // unsigned, so shifting a byte >= 0x80 into the top bit is well defined
+
+    while (*format) {   // one format character per field: 'D' = 64-bit, 'L' = 32-bit, 'S' = 16-bit
+        switch (*format) {
+            case 'D':
+                temp = cp [7] + ((uint64_t) cp [6] << 8) + ((uint64_t) cp [5] << 16) + ((uint64_t) cp [4] << 24) +
+                    ((uint64_t) cp [3] << 32) + ((uint64_t) cp [2] << 40) + ((uint64_t) cp [1] << 48) + ((uint64_t) cp [0] << 56);
+                * (int64_t *) cp = (int64_t) temp;
+                cp += 8;
+                break;
+
+            case 'L':
+                temp = cp [3] + ((uint32_t) cp [2] << 8) + ((uint32_t) cp [1] << 16) + ((uint32_t) cp [0] << 24);
+                * (int32_t *) cp = (int32_t) temp;
+                cp += 4;
+                break;
+
+            case 'S':
+                temp = cp [1] + (cp [0] << 8);
+                * (int16_t *) cp = (int16_t) temp;
+                cp += 2;
+                break;
+
+            default:    // a digit N skips N bytes; any other character is ignored
+                if (isdigit ((unsigned char) *format))
+                    cp += *format - '0';
+
+                break;
+        }
+
+        format++;
+    }
+}
+
+void WavpackNativeToBigEndian (void *data, char *format)
+{
+    unsigned char *cp = (unsigned char *) data;
+    int64_t temp;
+
+    for (; *format; ++format)       // 'D', 'L', 'S' rewrite a field in place; a digit N skips N bytes
+        switch (*format) {
+            case 'D':
+                temp = * (int64_t *) cp;
+                cp [0] = (unsigned char) (temp >> 56);
+                cp [1] = (unsigned char) (temp >> 48);
+                cp [2] = (unsigned char) (temp >> 40);
+                cp [3] = (unsigned char) (temp >> 32);
+                cp [4] = (unsigned char) (temp >> 24);
+                cp [5] = (unsigned char) (temp >> 16);
+                cp [6] = (unsigned char) (temp >> 8);
+                cp [7] = (unsigned char) temp;
+                cp += 8;
+                break;
+
+            case 'L':
+                temp = * (int32_t *) cp;
+                cp [0] = (unsigned char) (temp >> 24);
+                cp [1] = (unsigned char) (temp >> 16);
+                cp [2] = (unsigned char) (temp >> 8);
+                cp [3] = (unsigned char) temp;
+                cp += 4;
+                break;
+
+            case 'S':
+                temp = * (int16_t *) cp;
+                cp [0] = (unsigned char) (temp >> 8);
+                cp [1] = (unsigned char) temp;
+                cp += 2;
+                break;
+
+            default:
+                if (isdigit (*format))
+                    cp += *format - '0';
+
+                break;
+        }
+}
+
+uint32_t WavpackGetLibraryVersion (void)
+{
+    return (LIBWAVPACK_MAJOR<<16)       // major in bits 16 and up
+          |(LIBWAVPACK_MINOR<<8)        // minor shifted into bits 8 and up
+          |(LIBWAVPACK_MICRO<<0);       // micro in the low bits
+}
+
+const char *WavpackGetLibraryVersionString (void)
+{
+    return LIBWAVPACK_VERSION_STRING;   // macro expected from the build or from wavpack_version.h (see the include guard at the top)
+}
+
diff --git a/third_party/wavpack/src/decorr_tables.h b/third_party/wavpack/src/decorr_tables.h
new file mode 100644
index 0000000..be17de9
--- /dev/null
+++ b/third_party/wavpack/src/decorr_tables.h
@@ -0,0 +1,1077 @@
+////////////////////////////////////////////////////////////////////////////
+//                           **** WAVPACK ****                            //
+//                  Hybrid Lossless Wavefile Compressor                   //
+//              Copyright (c) 1998 - 2013 Conifer Software.               //
+//                          All Rights Reserved.                          //
+//      Distributed under the BSD Software License (see license.txt)      //
+////////////////////////////////////////////////////////////////////////////
+
+// decorr_tables.h
+
+// These four tables specify the characteristics of the decorrelation filters
+// for the four basic compression modes (fast, normal, high, and very high).
+//
+// The first entry in the table represents the "default" filter for the
+// corresponding mode; subsequent entries represent filters that are tried
+// in the "extra" modes 1-3 ("extra" modes 4-6 create filters from scratch).
+//
+// The first value indicates whether the filter is applied to joint stereo
+// data (0=L/R, 1=M/S) and the second value represents the "delta" value of
+// the adaptive filter. The rest of the values (2-16, depending on mode) are
+// the "terms" of the filter.
+//
+// Each term represents one layer of the sequential filter, where positive
+// values indicate the relative sample involved from the same channel (1=prev),
+// 17 & 18 are special functions using the previous 2 samples, and negative
+// values indicate cross channel decorrelation (in stereo only).
+//
+// It would be ideal if this was the only source for the decorrelation tables,
+// but unfortunately the defaults (first entry) are duplicated in the assembly
+// code for the function pack_decorr_mono_buffer() and there is no check in
+// that code to make sure the correct filter is being passed in. SO, IF A
+// CHANGE IS MADE HERE TO ONE OF THE DEFAULT FILTERS, THEN THE CORRESPONDING
+// ASSEMBLY CODE MUST BE CHANGED ALSO, OR VERY CORRUPT FILES WILL RESULT!!
+//
+// Since this include file contains the actual tables as static const data,
+// it should only be included from ONE source file (currently pack.c)!
+
+static const WavpackDecorrSpec fast_specs [] = {  // "fast" mode filters; each row is { joint-stereo flag (0=L/R, 1=M/S), delta, 2 terms }; row 0 is the mode's default filter
+        { 1, 2,18,17 },  // 0
+        { 1, 1,17,17 },  // 1
+        { 0, 2,18,17 },  // 2
+        { 0, 1,17,17 },  // 3
+        { 1, 3, 1,18 },  // 4
+        { 1, 1,17, 1 },  // 5
+        { 0, 1, 1,17 },  // 6
+        { 0, 1,-2,17 },  // 7
+        { 0, 2,-1,17 },  // 8
+        { 1, 1,17, 2 },  // 9
+        { 0, 3,18,18 },  // 10
+        { 0, 1,17, 1 },  // 11
+        { 1, 6, 1, 2 },  // 12
+        { 1, 1,17, 3 },  // 13
+        { 0, 1,-2, 3 },  // 14
+        { 0, 1, 2,17 },  // 15
+        { 0, 1,18,-2 },  // 16
+        { 0, 1,-1,17 },  // 17
+        { 0, 1,18,17 },  // 18
+        { 0, 1,17, 2 },  // 19
+        { 1, 2,18,-2 },  // 20
+        { 1, 1, 1,17 },  // 21
+        { 0, 3,18, 2 },  // 22
+        { 0, 1,17,-2 },  // 23
+        { 0, 1,18,-2 },  // 24
+        { 1, 2,17,-3 },  // 25
+        { 0, 1,18, 3 },  // 26
+        { 0, 1,18,18 },  // 27
+        { 1, 1, 1, 3 },  // 28
+        { 1, 1,18, 3 },  // 29
+        { 1, 1, 1, 3 },  // 30
+        { 0, 2,18,17 },  // 31
+        { 1, 1, 1,17 },  // 32
+        { 1, 1,17, 3 },  // 33
+        { 0, 3,18,17 },  // 34
+        { 0, 1,18,18 },  // 35
+        { 1, 1, 1, 3 },  // 36
+        { 1, 1, 1,18 },  // 37
+        { 0, 1,18,-2 },  // 38
+        { 0, 2,18,17 },  // 39
+        { 0, 1,-1,18 },  // 40
+        { 1, 1,17, 3 },  // 41
+        { 0, 1,17, 2 },  // 42
+        { 0, 1,17, 3 },  // 43
+        { 1, 1,18, 2 },  // 44
+        { 1, 1,17,-2 },  // 45
+        { 0, 1, 1,-2 },  // 46
+        { 0, 2,18,17 },  // 47
+        { 0, 1,17,-2 },  // 48
+        { 1, 1,17,-2 },  // 49
+        { 0, 1,18, 3 },  // 50
+        { 0, 1, 2,17 },  // 51
+        { 1, 2,18,-3 },  // 52
+        { 1, 2, 1,18 },  // 53
+        { 1, 2,18, 2 },  // 54
+        { 0, 1,17,-1 },  // 55
+        { 0, 1,17,-2 },  // 56
+        { 1, 1,17,-2 },  // 57
+        { 1, 1, 1, 3 },  // 58
+        { 0, 1, 1,17 },  // 59
+        { 1, 2,18,-2 },  // 60
+        { 1, 2,17,-3 },  // 61
+        { 0, 2,18,17 },  // 62
+        { 0, 2,18,17 },  // 63
+        { 1, 1,17, 2 },  // 64
+        { 1, 2,18,18 },  // 65
+        { 0, 1,17, 2 },  // 66
+        { 0, 1,18,17 },  // 67
+        { 1, 1, 1,17 },  // 68
+        { 1, 1,17, 2 },  // 69
+        { 0, 2,18,18 },  // 70
+        { 0, 2,18,17 },  // 71
+        { 1, 2,17,-3 },  // 72
+        { 1, 6, 1, 2 },  // 73
+        { 0, 3,17,17 },  // 74
+        { 0, 1, 1,18 },  // 75
+        { 0, 1, 1,-2 },  // 76
+        { 1, 1,17, 2 },  // 77
+        { 0, 2,18,17 },  // 78
+        { 0, 2,18,17 },  // 79
+        { 1, 1,18, 3 },  // 80
+        { 1, 2,17,-3 },  // 81
+        { 0, 1,17, 2 },  // 82
+        { 0, 1,17, 3 },  // 83
+        { 0, 1,18,-2 },  // 84
+        { 1, 1,18,18 },  // 85
+        { 1, 6, 1, 2 },  // 86
+        { 0, 2,18,17 },  // 87
+        { 0, 2,18,17 },  // 88
+        { 0, 1,-1,17 },  // 89
+        { 1, 1,18, 3 },  // 90
+        { 0, 1,17,18 },  // 91
+        { 1, 1,17, 3 },  // 92
+        { 0, 1,18, 3 },  // 93
+        { 0, 2,18,17 },  // 94
+        { 0, 2,18,17 },  // 95
+        { 1, 2,18, 2 },  // 96
+        { 0, 1,-2, 3 },  // 97
+        { 0, 4,18,-1 },  // 98
+        { 0, 2,18,18 },  // 99
+        { 0, 1,-2, 3 },  // 100
+        { 1, 1,17,-2 },  // 101
+        { 0, 1,17, 3 },  // 102
+        { 0, 2,18,17 },  // 103
+        { 0, 2,-1,18 },  // 104
+        { 1, 1, 2,17 },  // 105
+        { 0, 2,17,-2 },  // 106
+        { 0, 1,17, 2 },  // 107
+        { 1, 2,18,-3 },  // 108
+        { 0, 1,17,-2 },  // 109
+        { 0, 2,18,17 },  // 110
+        { 0, 2,18,17 },  // 111
+        { 1, 1,17,-2 },  // 112
+        { 1, 2,17,-3 },  // 113
+        { 1, 1, 1, 3 },  // 114
+        { 1, 1, 2,17 },  // 115
+        { 1, 2,18, 2 },  // 116
+        { 1, 1, 2,17 },  // 117
+        { 1, 1,18, 2 },  // 118
+        { 0, 2,18,17 },  // 119
+        { 0, 2,18,17 },  // 120
+        { 0, 1,17,-2 },  // 121
+        { 0, 2,18,17 },  // 122
+        { 0, 2,17,-1 },  // 123
+        { 0, 2,18,-2 },  // 124
+        { 0, 2,18,17 },  // 125
+        { 0, 2,18,17 },  // 126
+        { 0, 2,18,17 },  // 127
+        { 1, 1, 1, 3 },  // 128
+        { 0, 2,-2,17 },  // 129
+        { 0, 2,18,-2 },  // 130
+        { 0, 2,17,-2 },  // 131
+        { 1, 1, 2,17 },  // 132
+        { 1, 1, 1, 3 },  // 133
+        { 0, 1, 2,17 },  // 134
+        { 0, 2,18,17 },  // 135
+        { 0, 3,-1,17 },  // 136
+        { 1, 1, 2,17 },  // 137
+        { 0, 2,18,18 },  // 138
+        { 0, 1,17, 2 },  // 139
+        { 1, 4,18,-3 },  // 140
+        { 1, 1,18, 1 },  // 141
+        { 0, 2,18,17 },  // 142
+        { 0, 2,18,17 },  // 143
+        { 1, 2,18,-1 },  // 144
+        { 0, 1,-1,18 },  // 145
+        { 1, 6, 1, 2 },  // 146
+        { 1, 1,17, 2 },  // 147
+        { 1, 4,18, 3 },  // 148
+        { 0, 1, 1,17 },  // 149
+        { 0, 1,18, 2 },  // 150
+        { 0, 2,18,17 },  // 151
+        { 0, 2,18,17 },  // 152
+        { 1, 2,17, 2 },  // 153
+        { 0, 2,18,-2 },  // 154
+        { 0, 1, 1,18 },  // 155
+        { 1, 2,18,-3 },  // 156
+        { 0, 2,18,17 },  // 157
+        { 0, 2,18,17 },  // 158
+        { 0, 2,18,17 },  // 159
+        { 1, 2,18,18 },  // 160
+        { 1, 3,17,17 },  // 161
+        { 0, 1,-2,17 },  // 162
+        { 0, 1,17,18 },  // 163
+        { 0, 1,-1, 3 },  // 164
+        { 1, 1, 2,17 },  // 165
+        { 0, 2,18,-1 },  // 166
+        { 0, 2,18,17 },  // 167
+        { 0, 2,18,17 },  // 168
+        { 1, 1,17,-2 },  // 169
+        { 1, 2,17, 2 },  // 170
+        { 1, 1,18, 3 },  // 171
+        { 0, 1,18, 2 },  // 172
+        { 1, 2,17,-3 },  // 173
+        { 0, 2,18,17 },  // 174
+        { 0, 2,18,17 },  // 175
+        { 0, 1,-2,17 },  // 176
+        { 0, 1,17,-1 },  // 177
+        { 0, 1,18,-1 },  // 178
+        { 0, 2,18,17 },  // 179
+        { 1, 2,17,-3 },  // 180
+        { 1, 1, 1,18 },  // 181
+        { 1, 3,18, 2 },  // 182
+        { 0, 2,18,17 },  // 183
+        { 0, 2,18,17 },  // 184
+        { 0, 2,18,17 },  // 185
+        { 0, 2,18,17 },  // 186
+        { 0, 3,18,18 },  // 187
+        { 0, 1, 1,-2 },  // 188
+        { 0, 2,18,17 },  // 189
+        { 0, 2,18,17 },  // 190
+        { 0, 2,18,17 },  // 191
+        { 1, 2,17,-3 },  // 192
+        { 1, 1,18,18 },  // 193
+        { 0, 2,18, 2 },  // 194
+        { 0, 1,17,18 },  // 195
+        { 1, 2,18, 2 },  // 196
+        { 1, 1,17,-2 },  // 197
+        { 0, 2,17,-1 },  // 198
+        { 0, 2,18,17 },  // 199
+        { 0, 2,18,17 },  // 200
+        { 0, 2,18,17 },  // 201
+        { 0, 1, 1,-2 },  // 202
+        { 0, 1,18, 1 },  // 203
+        { 1, 2,18,-2 },  // 204
+        { 0, 1,17, 2 },  // 205
+        { 0, 2,18,17 },  // 206
+        { 0, 2,18,17 },  // 207
+        { 1, 1,17, 3 },  // 208
+        { 0, 1,17,-1 },  // 209
+        { 0, 1,18, 2 },  // 210
+        { 1, 1,17, 3 },  // 211
+        { 1, 1,17,-2 },  // 212
+        { 0, 1,18,18 },  // 213
+        { 0, 2,18,17 },  // 214
+        { 0, 2,18,17 },  // 215
+        { 0, 2,18,17 },  // 216
+        { 0, 2,18,17 },  // 217
+        { 0, 2,18,17 },  // 218
+        { 1, 1,17,18 },  // 219
+        { 0, 1,-2, 3 },  // 220
+        { 0, 2,18,17 },  // 221
+        { 0, 2,18,17 },  // 222
+        { 0, 2,18,17 },  // 223
+        { 1, 2,18,-3 },  // 224
+        { 0, 2,18,17 },  // 225
+        { 0, 3,18, 2 },  // 226
+        { 0, 1, 1,18 },  // 227
+        { 0, 2,18,17 },  // 228
+        { 0, 1,17,-1 },  // 229
+        { 0, 2,18,17 },  // 230
+        { 0, 2,18,17 },  // 231
+        { 0, 2,18,17 },  // 232
+        { 0, 1,-2, 3 },  // 233
+        { 0, 3,17,17 },  // 234
+        { 0, 2,18,17 },  // 235
+        { 0, 2,18,17 },  // 236
+        { 1, 1,17, 2 },  // 237
+        { 0, 2,18,17 },  // 238
+        { 0, 2,18,17 },  // 239
+        { 1, 1,17, 2 },  // 240
+        { 0, 2,18,17 },  // 241
+        { 0, 2,18,17 },  // 242
+        { 0, 2,18,17 },  // 243
+        { 0, 2,18, 2 },  // 244
+        { 0, 2,18,17 },  // 245
+        { 0, 2,18,17 },  // 246
+        { 0, 2,18,17 },  // 247
+        { 0, 2,18,17 },  // 248
+        { 0, 2,18,17 },  // 249
+        { 0, 2,18,17 },  // 250
+        { 0, 2,18,17 },  // 251
+        { 0, 2,18,17 },  // 252
+        { 0, 2,18,17 },  // 253
+        { 0, 2,18,17 },  // 254
+        { 0, 2,18,17 },  // 255
+};  // 256 rows, indexed 0-255 as numbered in the row comments
+
+static const WavpackDecorrSpec default_specs [] = {  // each row is { joint-stereo flag (0=L/R, 1=M/S), delta, 5 terms }; row 0 is the mode's default filter; presumably the header's "normal" mode - confirm
+        { 1, 2,18,18, 2,17, 3 },         // 0
+        { 0, 2,18,17,-1, 3, 2 },         // 1
+        { 1, 1,17,18,18,-2, 2 },         // 2
+        { 0, 2,18,17, 3,-2,17 },         // 3
+        { 1, 2,18,17, 2,17, 3 },         // 4
+        { 0, 1,18,18,-1, 2,17 },         // 5
+        { 0, 1,17,17,-2, 2, 3 },         // 6
+        { 0, 1,18,-2,18, 2,17 },         // 7
+        { 1, 2,18,18,-1, 2, 3 },         // 8
+        { 0, 2,18,17, 3, 2, 5 },         // 9
+        { 1, 1,18,17,18, 2, 5 },         // 10
+        { 0, 1,17,17,-2, 2, 3 },         // 11
+        { 0, 1,18,-2,18, 2, 5 },         // 12
+        { 0, 1,17,-2,17, 2,-3 },         // 13
+        { 1, 1,17,-2,17, 1, 2 },         // 14
+        { 0, 1,17,17,-2, 2, 3 },         // 15
+        { 1, 1,18, 3, 1, 5, 4 },         // 16
+        { 1, 4,18,18, 2, 3,-2 },         // 17
+        { 0, 1, 1,-1,-1, 2,17 },         // 18
+        { 0, 2,18,17, 3, 2, 5 },         // 19
+        { 0, 1,18,18,18, 2,17 },         // 20
+        { 0, 1,18,17,-1, 2,18 },         // 21
+        { 1, 1,17, 3, 2, 1, 7 },         // 22
+        { 0, 2,18,-2,18, 2, 3 },         // 23
+        { 1, 3,18,-3,18, 2, 3 },         // 24
+        { 0, 3,18,17, 2, 3,17 },         // 25
+        { 1, 1,17,17, 2, 1, 4 },         // 26
+        { 0, 1,17,18,-2, 2,17 },         // 27
+        { 1, 1,18,18, 3, 5, 2 },         // 28
+        { 0, 1,17,17, 2,18, 4 },         // 29
+        { 0, 1,18,17, 1, 4, 6 },         // 30
+        { 1, 1, 3,17,18, 2,17 },         // 31
+        { 1, 1,17, 3, 2, 1, 7 },         // 32
+        { 0, 1,18,17,-1, 2, 3 },         // 33
+        { 1, 1,17,17, 2, 1, 4 },         // 34
+        { 1, 2,18,17,-1,17, 3 },         // 35
+        { 1, 2,18,17, 2, 3,-1 },         // 36
+        { 0, 2,18,18,-2, 2,17 },         // 37
+        { 0, 1,17,17, 2,18, 4 },         // 38
+        { 0, 5,-2,18,18,18, 2 },         // 39
+        { 1, 1,18,18,-1, 6, 3 },         // 40
+        { 0, 1,17,17,-2, 2, 3 },         // 41
+        { 1, 1,18,17,18, 2,17 },         // 42
+        { 0, 1,18,17, 4, 3, 1 },         // 43
+        { 0, 1,-2,18, 2, 2,18 },         // 44
+        { 1, 2,18,18,-2, 2,-1 },         // 45
+        { 1, 1,17,17, 2, 1, 4 },         // 46
+        { 0, 1,17,18,-2, 2,17 },         // 47
+        { 1, 1,17, 3, 2, 1, 7 },         // 48
+        { 1, 3,18,-3,18, 2, 3 },         // 49
+        { 1, 2,18,18,-2, 2,-1 },         // 50
+        { 1, 1,18,18, 3, 5, 2 },         // 51
+        { 0, 2,18,18,-1, 2,17 },         // 52
+        { 0, 1,18,-1,17,18, 2 },         // 53
+        { 0, 1,17,-1, 2, 3, 6 },         // 54
+        { 0, 1,18,-2,18, 2, 5 },         // 55
+        { 1, 2,18,18,-2, 2,-1 },         // 56
+        { 0, 3,18,18, 2, 3,17 },         // 57
+        { 0, 1,17,17, 2,18, 4 },         // 58
+        { 1, 1,17,-2,17, 1, 2 },         // 59
+        { 0, 1,-1, 3, 5, 4, 7 },         // 60
+        { 0, 3,18,18, 3, 2, 5 },         // 61
+        { 0, 1,17,17, 2,18, 4 },         // 62
+        { 0, 1,18,17,-2,18, 3 },         // 63
+        { 0, 2,18,18,-2, 2,17 },         // 64
+        { 0, 3,18,17,-2, 2, 3 },         // 65
+        { 1, 1,18,18,-2, 2,17 },         // 66
+        { 0, 1,18,17, 4, 3, 1 },         // 67
+        { 1, 2, 3,18,17, 2,17 },         // 68
+        { 1, 2,18,18, 2,-2,18 },         // 69
+        { 1, 2,18,18,-1,18, 2 },         // 70
+        { 0, 2,18,18,-2, 2,17 },         // 71
+        { 1, 3,18,18, 2, 3,-2 },         // 72
+        { 0, 3,18,18, 3, 2, 5 },         // 73
+        { 0, 1,18,-2,18, 2, 5 },         // 74
+        { 1, 1,17, 3, 2, 1, 7 },         // 75
+        { 1, 3,18,18,-2, 2,18 },         // 76
+        { 1, 1,17,18,18,-2, 2 },         // 77
+        { 0, 1,18,-2,18, 2, 5 },         // 78
+        { 0, 2,18,-2,18, 2, 3 },         // 79
+        { 0, 1,-1, 3, 4, 5, 7 },         // 80
+        { 1, 1,17,17, 2,-1, 7 },         // 81
+        { 0, 1,18,-1,-1, 2,-2 },         // 82
+        { 0, 2,18,17, 2, 3,17 },         // 83
+        { 0, 1,18,17, 2,18, 2 },         // 84
+        { 0, 2,18,17,-1, 2,17 },         // 85
+        { 0, 1, 1,18, 3, 2, 5 },         // 86
+        { 0, 2,18,-2, 4,18, 2 },         // 87
+        { 1, 1,18, 3, 1, 5, 4 },         // 88
+        { 0, 1,18,17,18, 2, 5 },         // 89
+        { 1, 1,18, 3, 1, 5, 4 },         // 90
+        { 0, 4,18,18,-2, 2,18 },         // 91
+        { 1, 1,18,18, 3, 2, 5 },         // 92
+        { 1, 1,17,17, 2, 1, 4 },         // 93
+        { 0, 2,18,18,-2,18, 2 },         // 94
+        { 0, 2,18,18,-2,18, 2 },         // 95
+        { 1, 1,18,18, 2, 1, 3 },         // 96
+        { 1, 1,17,17, 2, 1, 4 },         // 97
+        { 1, 2,17,17, 2,18, 3 },         // 98
+        { 0, 1,18,17, 1, 4, 6 },         // 99
+        { 1, 2,18,18,-2, 2,-1 },         // 100
+        { 0, 1,18,-2,18, 2, 5 },         // 101
+        { 1, 1,17, 2,18, 2,17 },         // 102
+        { 0, 2,18,18,-2,18, 2 },         // 103
+        { 0, 1,18,18, 3, 6,-1 },         // 104
+        { 0, 1,18,17, 2,18, 3 },         // 105
+        { 0, 1,18,17,-2, 2,17 },         // 106
+        { 1, 1, 3,17,18, 2,17 },         // 107
+        { 1, 3,18,-3,18, 2, 3 },         // 108
+        { 1, 3,18,18,-3,18, 2 },         // 109
+        { 1, 1,18, 3, 1, 5, 4 },         // 110
+        { 0, 1,17,-2,17, 2,-3 },         // 111
+        { 1, 1,18,18, 3, 5, 2 },         // 112
+        { 1, 2,18,18,-2, 2,-1 },         // 113
+        { 0, 1,18,-1,-1, 2,-2 },         // 114
+        { 1, 1,18, 3, 1, 5, 4 },         // 115
+        { 0, 3,18,17,-1, 2,17 },         // 116
+        { 1, 3,18,17, 2,18,-2 },         // 117
+        { 0, 2,18,18,-2,18, 2 },         // 118
+        { 1, 2,18,18,-2, 2,-1 },         // 119
+        { 1, 1,18, 3, 1, 5, 4 },         // 120
+        { 0, 4, 3,18,18, 2,17 },         // 121
+        { 0, 2,18,18,-2,18, 2 },         // 122
+        { 1, 1,18,17,-1,18, 2 },         // 123
+        { 0, 2,18,18,-2,18, 2 },         // 124
+        { 0, 2,18,18,-2,18, 2 },         // 125
+        { 0, 2,18,18,-2,18, 2 },         // 126
+        { 0, 2,18,18,-2,18, 2 },         // 127
+        { 1, 1,18,18,18, 3, 2 },         // 128
+        { 0, 1,17,-1, 2, 3, 6 },         // 129
+        { 0, 1,17,-1, 2, 3, 6 },         // 130
+        { 0, 2,18,17,-2, 3, 2 },         // 131
+        { 1, 3,18,17, 2,-2,18 },         // 132
+        { 0, 2,18,18, 2,17, 3 },         // 133
+        { 0, 1,18,18, 2,18,-2 },         // 134
+        { 0, 2,18,-2, 4,18, 2 },         // 135
+        { 0, 1,-2,18, 2, 2,18 },         // 136
+        { 0, 2,18,17, 3, 6, 2 },         // 137
+        { 0, 1,18,17,18, 2, 5 },         // 138
+        { 0, 3,18,18,-2, 3, 2 },         // 139
+        { 1, 1,18,18, 2,18, 5 },         // 140
+        { 0, 1,17,-1, 2, 3, 6 },         // 141
+        { 1, 4,18,18, 2, 3,-2 },         // 142
+        { 0, 2,18,17,18, 2,-2 },         // 143
+        { 0, 1, 1,18, 3, 2, 5 },         // 144
+        { 1, 4,18,-2,18, 2, 3 },         // 145
+        { 1, 2,18, 2,18, 3,-2 },         // 146
+        { 0, 2,18,18,18, 2, 4 },         // 147
+        { 0, 2, 3,17,18, 2,17 },         // 148
+        { 1, 1,18,-1,18, 2,17 },         // 149
+        { 1, 2,17,17, 2,18, 3 },         // 150
+        { 0, 2,18,17,-2, 3, 2 },         // 151
+        { 0, 1, 1,-1,-1, 2,17 },         // 152
+        { 0, 3, 3,18,18, 2,17 },         // 153
+        { 0, 1,18,-1,17,18, 2 },         // 154
+        { 0, 1,18,17, 2,18, 3 },         // 155
+        { 0, 2,18,18,-2,18, 2 },         // 156
+        { 0, 1,18,17, 2,18, 2 },         // 157
+        { 0, 2,18,18,-2,18, 2 },         // 158
+        { 0, 2,18,18,-2,18, 2 },         // 159
+        { 1, 2,17,17, 2,18, 3 },         // 160
+        { 0, 1,18,17,-2, 2, 3 },         // 161
+        { 0, 1,18,-2,18, 2, 5 },         // 162
+        { 1, 4,18,-2,18, 2, 3 },         // 163
+        { 1, 3,18,17, 2, 3, 6 },         // 164
+        { 0, 2,18,18, 2,17, 3 },         // 165
+        { 0, 2,18,17, 2,18, 2 },         // 166
+        { 0, 2,18,18,-2,18, 2 },         // 167
+        { 1, 1,18,18, 3, 5, 2 },         // 168
+        { 0, 2,18,18,-2, 2, 3 },         // 169
+        { 1, 2,18,17, 2,17, 3 },         // 170
+        { 0, 1,18,17, 2, 3,18 },         // 171
+        { 0, 2,18,18,-2,18, 2 },         // 172
+        { 1, 4,18,18, 2, 3,-2 },         // 173
+        { 0, 1,17,-2,17, 2,-3 },         // 174
+        { 0, 1,17,17, 2,18, 4 },         // 175
+        { 1, 1,18,18,18, 2, 4 },         // 176
+        { 1, 2,18, 2,18, 3,-2 },         // 177
+        { 1, 1,18,18,-2, 2,17 },         // 178
+        { 0, 2,18,18,-2,18, 2 },         // 179
+        { 0, 2,18,18, 2,17, 3 },         // 180
+        { 0, 2,18,18,18, 2, 4 },         // 181
+        { 0, 2,18,18,-2,18, 2 },         // 182
+        { 0, 2,18,17,-2, 3, 2 },         // 183
+        { 0, 1, 1,-1,-1, 2,17 },         // 184
+        { 1, 4,18,18, 2, 3,-2 },         // 185
+        { 0, 2,18,18,-2,18, 2 },         // 186
+        { 0, 1,18,-2,18, 3, 2 },         // 187
+        { 0, 2,18,18,-2,18, 2 },         // 188
+        { 0, 2,18,18,-2,18, 2 },         // 189
+        { 0, 2,18,18,-2,18, 2 },         // 190
+        { 0, 2,18,18,-2,18, 2 },         // 191
+        { 0, 1,18,18,-2, 2,17 },         // 192
+        { 0, 3,18,17, 2, 3,17 },         // 193
+        { 1, 2,18,18, 2,-2,18 },         // 194
+        { 0, 1,-1, 3, 5, 4, 7 },         // 195
+        { 1, 1,18, 3, 1, 5, 4 },         // 196
+        { 1, 1,18,18,-2,18, 3 },         // 197
+        { 0, 2,18,17,18, 2,-2 },         // 198
+        { 0, 2,18,18, 2,17, 3 },         // 199
+        { 1, 2,18, 2,18, 3,-2 },         // 200
+        { 1, 4,18,18, 2, 3,-2 },         // 201
+        { 1, 3,18,17, 2, 3, 6 },         // 202
+        { 0, 2,18,18,-2,18, 2 },         // 203
+        { 1, 2,18,17,-2,-1,17 },         // 204
+        { 0, 1,17,-1, 2, 3, 6 },         // 205
+        { 0, 2,18,18,-2,18, 2 },         // 206
+        { 0, 2,18,18,-2, 2, 3 },         // 207
+        { 1, 1,18,18,18, 2, 5 },         // 208
+        { 0, 1,17,17,-2, 2, 3 },         // 209
+        { 0, 2,18,18,-2,18, 2 },         // 210
+        { 0, 2,18,17, 3, 6, 2 },         // 211
+        { 0, 2,18,17,18, 2, 3 },         // 212
+        { 0, 3,18,17,-3,18, 2 },         // 213
+        { 0, 1,18,18,18, 2, 3 },         // 214
+        { 0, 1,18,-2,-3, 2, 6 },         // 215
+        { 0, 2,18,18,-2,18, 2 },         // 216
+        { 1, 1,18,17,18, 2, 5 },         // 217
+        { 0, 2,18,18,-2,18, 2 },         // 218
+        { 0, 2,18,18,-2,18, 2 },         // 219
+        { 1, 1,18,17,18, 2, 5 },         // 220
+        { 0, 2,18,18,-2,18, 2 },         // 221
+        { 0, 2,18,18,-2,18, 2 },         // 222
+        { 0, 2,18,18,-2,18, 2 },         // 223
+        { 0, 1,18,18,18, 2, 3 },         // 224
+        { 1, 1,17,-2,17, 1, 2 },         // 225
+        { 1, 1,17,17, 2,-1, 7 },         // 226
+        { 0, 1,18,17, 4, 3, 1 },         // 227
+        { 1, 3,18,-3,18, 2, 3 },         // 228
+        { 0, 1, 1,18, 3, 2, 5 },         // 229
+        { 0, 2,18,18,-2,18, 2 },         // 230
+        { 0, 2,18,18,-2,18, 2 },         // 231
+        { 0, 1,18,18, 3, 6, 2 },         // 232
+        { 0, 1,17,17, 2,18, 4 },         // 233
+        { 0, 1,17,17, 2,18, 4 },         // 234
+        { 0, 2,18,18,-2,18, 2 },         // 235
+        { 0, 2,18,18,-2,18, 2 },         // 236
+        { 0, 2,18,18,-2,18, 2 },         // 237
+        { 1, 2,18,-2,18, 3, 2 },         // 238
+        { 1, 1,17,-2,17, 1, 2 },         // 239
+        { 1, 1,18,18, 3, 2, 5 },         // 240
+        { 0, 1,18,18,-1, 2, 3 },         // 241
+        { 0, 2,18,18,-2,18, 2 },         // 242
+        { 0, 2,18,18,-2,18, 2 },         // 243
+        { 0, 1,18,17,18, 2, 5 },         // 244
+        { 0, 2,18,18,-2,18, 2 },         // 245
+        { 0, 2,18,18,-2,18, 2 },         // 246
+        { 0, 2,18,18,-2,18, 2 },         // 247
+        { 0, 2,18,18,-2,18, 2 },         // 248
+        { 0, 1, 3,18,18, 2,17 },         // 249
+        { 0, 2,18,18,-2,18, 2 },         // 250
+        { 0, 2,18,18,-2,18, 2 },         // 251
+        { 0, 2,18,18,-2,18, 2 },         // 252
+        { 0, 2,18,18,-2,18, 2 },         // 253
+        { 0, 2,18,18,-2,18, 2 },         // 254
+        { 0, 2,18,18,-2,18, 2 },         // 255
+};  // 256 rows, indexed 0-255 as numbered in the row comments
+
+static const WavpackDecorrSpec high_specs [] = {
+        { 1, 2,18,18,18,-2, 2, 3, 5,-1,17, 4 },  // 0
+        { 0, 1,18,17,-2, 2,18, 3, 7, 2, 5, 4 },  // 1
+        { 1, 2, 1,18, 3, 6,-2,18, 2, 3, 4, 5 },  // 2
+        { 0, 2,18,18,-2, 2,18, 3, 6, 2,17, 4 },  // 3
+        { 1, 2,18,18, 2,18, 3, 2,-1, 4,18, 5 },  // 4
+        { 1, 1, 7, 6, 5, 3, 4, 2, 5, 4, 3, 7 },  // 5
+        { 1, 1,17, 3,18, 7, 2, 6, 1, 4, 3, 5 },  // 6
+        { 1, 1,-2,18,18,18, 3,-2, 6, 5, 2, 1 },  // 7
+        { 1, 2,18,18,-1,18, 2, 3, 6,-2,17, 5 },  // 8
+        { 0, 1,17,17,18, 3, 6, 4, 5, 2,18,-2 },  // 9
+        { 1, 2, 1,18,-2, 3, 5, 2, 4,-1, 6, 1 },  // 10
+        { 0, 2,18,18, 3, 6,18, 2, 4, 8, 5, 3 },  // 11
+        { 0, 1,-2, 1,18, 2,-2, 7,18, 2,-1, 5 },  // 12
+        { 1, 1, 4, 3, 8, 1, 5, 2, 5, 6, 2, 8 },  // 13
+        { 1, 1,17,18, 2, 6, 3, 4,-1, 1, 8, 6 },  // 14
+        { 0, 1,18,18, 3, 6, 3,-2, 2, 5,-1, 1 },  // 15
+        { 0, 1,18,18,17,-1, 2,-2,18, 3, 4, 5 },  // 16
+        { 1, 2,18,17, 2,-2,18, 3, 5, 7, 2, 4 },  // 17
+        { 1, 2,18,18, 3, 6,-2,18, 2, 5, 8, 3 },  // 18
+        { 0, 1,18,17, 2,18,18, 2, 6, 5,17, 7 },  // 19
+        { 1, 2,18,17, 2,18, 3, 2, 6,18,-1, 4 },  // 20
+        { 1, 1, 5, 3, 6, 5, 3, 4, 1, 2, 4, 7 },  // 21
+        { 1, 1, 5, 3, 6, 5, 3, 4, 1, 2, 4, 7 },  // 22
+        { 0, 1,-2,18,18,18,-2, 3, 2, 4, 6, 5 },  // 23
+        { 1, 2,18,17,-3, 3,-1,18, 2, 3, 6, 5 },  // 24
+        { 0, 1,17,18, 7, 3,-2, 7, 1, 2, 4, 5 },  // 25
+        { 1, 1, 2,18,18,-2, 2, 4,-1,18, 3, 6 },  // 26
+        { 0, 3, 1,18, 4, 3, 5, 2, 4,18, 2, 3 },  // 27
+        { 0, 1,-2,18, 2,18, 3, 7,18, 2, 6,-2 },  // 28
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 29
+        { 1, 1,18,18, 5, 4, 6, 4, 5, 1, 4, 3 },  // 30
+        { 1, 1,18, 3, 6, 5, 7, 8, 2, 3, 1,-1 },  // 31
+        { 1, 1,18,18,18, 2,-2, 3, 5,18, 2, 8 },  // 32
+        { 0, 2,18,17,-2, 2, 3,18,-3, 5, 2, 7 },  // 33
+        { 1, 1, 1, 1,-1, 8,17, 3,-2, 2, 6,17 },  // 34
+        { 0, 2,18,18,17, 2,-2, 3, 2, 4,18, 5 },  // 35
+        { 1, 1,17,18, 2,-1, 5, 7,18, 3, 4, 6 },  // 36
+        { 1, 1, 5, 4, 5,17, 3, 6, 3, 4, 7, 2 },  // 37
+        { 0, 1,17, 3, 1, 7, 4, 2, 5,-2,18, 6 },  // 38
+        { 0, 1,17,18, 2,18, 4, 3, 5, 7,-3, 6 },  // 39
+        { 1, 2,17,17,-3,-2, 2, 8,18,-1, 3, 5 },  // 40
+        { 0, 1,17,17,18, 2, 3, 6,-2, 8, 1, 7 },  // 41
+        { 1, 1, 1, 2, 6,-2,18, 2, 5,-3, 7,-2 },  // 42
+        { 0, 1,18,18, 3,18, 6, 8,-2, 2, 3, 5 },  // 43
+        { 0, 1,18,17, 2,18,-2, 3, 7, 6, 2, 4 },  // 44
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 45
+        { 1, 1,18,18, 2,-1, 3, 6, 1, 3, 4, 8 },  // 46
+        { 0, 1,18,18, 3, 6, 5, 3,-2, 2,18,-1 },  // 47
+        { 0, 1,18,17,-3,18, 2, 4,-2, 3, 6,17 },  // 48
+        { 1, 3, 1, 2,17, 3,18, 7,-1, 5, 2, 4 },  // 49
+        { 1, 1,18, 3,18, 6, 8,18,-2, 5, 7, 2 },  // 50
+        { 0, 1,17, 2,18, 6, 3, 2, 5, 4, 8, 1 },  // 51
+        { 0, 1,18,17,-1, 2, 3,18,18, 2, 3,17 },  // 52
+        { 1, 1,18, 7, 6, 5, 5, 3, 1, 4, 2, 4 },  // 53
+        { 1, 1, 6,17, 3, 8, 1, 5, 7,-1, 2, 1 },  // 54
+        { 1, 1,18,-2,18, 3,-2, 2, 7, 4, 6,18 },  // 55
+        { 1, 3,18,-3,18, 2, 3,18,-1, 7, 2, 5 },  // 56
+        { 0, 2,18,-2, 7, 1, 3, 2, 4, 6,-3, 7 },  // 57
+        { 1, 1,18,-2, 2,-3,18,-2,17,-1, 4, 2 },  // 58
+        { 0, 3,17,17, 2, 5, 3, 7,18, 6, 4, 2 },  // 59
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 60
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 61
+        { 1, 1,18,17, 4, 6, 6, 4, 5, 3, 4, 1 },  // 62
+        { 0, 1,18, 5, 3, 6, 2, 3, 8, 1, 3, 7 },  // 63
+        { 1, 2,18,17,-2, 2,18, 3, 5, 7,-1, 2 },  // 64
+        { 0, 1, 1,18,18, 3, 6,-1, 4, 8, 5, 2 },  // 65
+        { 1, 1, 1, 5, 3, 4, 1, 1, 3, 5, 7, 3 },  // 66
+        { 0, 1, 3,18,18, 2,18,18,-1, 2, 3,18 },  // 67
+        { 1, 2,18,18,-1,18, 2, 3, 4, 6,18, 5 },  // 68
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 69
+        { 1, 1,18, 3, 1, 4, 5, 2, 7, 1, 3, 6 },  // 70
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 71
+        { 1, 2,18,18,-1,18, 2, 3, 5,-2, 6, 8 },  // 72
+        { 1, 1,17,18, 4, 8, 3, 2, 5, 2, 7, 6 },  // 73
+        { 1, 4, 1, 2, 5,18,-2, 2, 3, 7,-1, 4 },  // 74
+        { 0, 2,18,17,-1, 3, 6,18, 2, 3, 7, 5 },  // 75
+        { 0, 1,-2,18, 2,-3, 6,18, 4, 3,-2, 5 },  // 76
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 77
+        { 0, 1,17,17, 6, 2, 4, 8, 3, 5,-1,17 },  // 78
+        { 1, 1,18, 3,18, 6, 8,18,-2, 5, 7, 2 },  // 79
+        { 1, 2,17,17,-3, 2,18,-2, 8, 3, 6,-1 },  // 80
+        { 1, 1,18,-2,17,18, 2, 3,-2, 6, 5, 4 },  // 81
+        { 1, 2,18,17,-1, 3,18, 2, 5, 3, 6,-3 },  // 82
+        { 0, 1,18,17, 2,18, 7,18, 2, 4, 3,17 },  // 83
+        { 1, 3,18,18, 5, 6, 4, 3, 4,18, 6, 5 },  // 84
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 85
+        { 1, 1, 7, 6, 5, 3, 4, 2, 5, 4, 3, 7 },  // 86
+        { 0, 1,-2,18,18,18, 3, 6, 4, 2, 5, 2 },  // 87
+        { 0, 3,18,17,-3,18, 3, 2, 5,-1,17, 3 },  // 88
+        { 1, 1,17,18, 7, 3, 1, 7, 4, 2, 6, 5 },  // 89
+        { 1, 1,18, 2,-2,-1,18, 5, 3,-2, 1, 2 },  // 90
+        { 0, 3,18,18,-1, 3, 2, 7, 5,18, 4, 3 },  // 91
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 92
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 93
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 94
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 95
+        { 1, 1,17,18, 2,-2, 4, 8,18, 3, 6, 5 },  // 96
+        { 0, 2,18,17, 3, 5,-2, 7, 2,18, 3,-1 },  // 97
+        { 1, 1,18, 2,-2,-1,18, 5, 3,-2, 1, 2 },  // 98
+        { 0, 2, 3,17,18,18, 2, 5, 7, 6,18, 3 },  // 99
+        { 1, 1,17,18,18, 4, 3, 2,18, 7, 8,-1 },  // 100
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 101
+        { 0, 1,17, 1, 2, 3, 5, 6, 1, 4, 8,17 },  // 102
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 103
+        { 0, 2,18,17,-1,18,-3, 2, 8, 3, 6,17 },  // 104
+        { 1, 1,17,17, 1, 2, 4, 5,-1, 2, 1, 6 },  // 105
+        { 1, 1, 1, 2, 6,-2,18, 2,-3, 3,-2, 5 },  // 106
+        { 0, 1,18, 3,18, 6,18, 5, 2, 4,-1, 8 },  // 107
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 108
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 109
+        { 1, 1,18,18,-1, 2,18, 3, 6, 4,-2, 7 },  // 110
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 111
+        { 0, 2,-1,18,18,18, 2,-2, 4, 7, 2, 3 },  // 112
+        { 0, 3, 3,17,-2, 5, 2, 7,18, 6, 4, 5 },  // 113
+        { 0, 1,17, 6,18, 3, 8, 4, 5, 3, 8,18 },  // 114
+        { 0, 2,18, 2, 6, 2,18, 3, 2, 4, 5, 8 },  // 115
+        { 0, 1, 3,18,18, 2,18,-1, 2,18, 2,17 },  // 116
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 117
+        { 0, 1, 3, 6,17,-2, 5, 1, 2, 7, 4, 8 },  // 118
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 119
+        { 1, 3, 3,18,17, 5, 6, 2, 7,-2, 8,18 },  // 120
+        { 1, 1,18,-1, 3, 1, 7, 2,-1, 4, 6,17 },  // 121
+        { 1, 1,18, 2,-2,-1,18, 5, 3,-2, 1, 2 },  // 122
+        { 0, 2,18, 1, 2,18, 3, 6, 5, 2, 4, 8 },  // 123
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 124
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 125
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 126
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 127
+        { 1, 1,17,-2, 2,18,18, 8, 5, 3, 2, 6 },  // 128
+        { 0, 1,18,17, 2,18, 3, 2, 7,-2,18, 4 },  // 129
+        { 1, 2, 1,18, 2, 3,-1, 5, 6, 4, 7,17 },  // 130
+        { 0, 2,18,17, 3, 6,-2, 2, 3, 8, 5,17 },  // 131
+        { 0, 2,18,18, 3, 2,18,-1, 2, 4, 3,17 },  // 132
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 133
+        { 1, 2,17,-1,18, 2, 3,-2, 5,18, 2, 7 },  // 134
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 135
+        { 1, 2,18,-3,18, 2, 3,-2,18, 5, 6,-3 },  // 136
+        { 0, 2,18,17, 3, 5,-2, 7, 2,18, 3,-1 },  // 137
+        { 1, 1, 1,18,-1, 2, 3, 1,-2, 8, 2, 5 },  // 138
+        { 0, 1,18,18, 3, 6,18, 2, 3, 4, 8, 5 },  // 139
+        { 0, 1,-2, 1,18, 2,-2, 5, 7,18, 2,-1 },  // 140
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 141
+        { 1, 1,17,18,-1, 2, 8, 3, 4, 5, 1, 7 },  // 142
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 143
+        { 0, 2,18,18,-1, 2,18, 3,-2, 5, 4, 2 },  // 144
+        { 1, 1,18,17, 2,18, 3, 8, 5, 2, 7,17 },  // 145
+        { 0, 1,18,18, 3,18, 6, 8,-2, 2, 3, 5 },  // 146
+        { 0, 1,18,18, 2,18, 2, 6,18, 2,17, 7 },  // 147
+        { 1, 3,18,17,18, 2, 8,18, 5,-1, 3, 6 },  // 148
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 149
+        { 1, 1,18, 7, 6, 5, 5, 3, 1, 4, 2, 4 },  // 150
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 151
+        { 1, 2,18,17,-1, 3, 6,18, 2, 5, 8, 3 },  // 152
+        { 0, 1,17,18,18, 4, 7, 2, 3,-2,18, 5 },  // 153
+        { 1, 2,18, 1, 2, 6, 2, 5,18, 2, 4, 8 },  // 154
+        { 0, 4,18, 4, 1, 2, 3, 5, 4, 1, 2, 6 },  // 155
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 156
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 157
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 158
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 159
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 160
+        { 0, 2,18,17, 2,-1,18, 3,-3, 5, 2, 4 },  // 161
+        { 0, 1,17,17, 3, 6, 3, 5,-2, 2,18,-1 },  // 162
+        { 0, 2,18,18, 3,-2,18, 2,-3, 5, 3, 6 },  // 163
+        { 1, 1,17,17, 2, 4, 1, 3, 5, 2, 6,-3 },  // 164
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 165
+        { 0, 1,17, 1, 3, 2, 7, 1, 6, 3, 4, 8 },  // 166
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 167
+        { 0, 1,17,-1,18, 2, 1, 5, 3, 8,-1,-2 },  // 168
+        { 1, 1,17,18,-1, 8, 2, 5, 3, 4, 1, 6 },  // 169
+        { 1, 2, 1,18, 3,-1, 5, 1, 2, 4, 7, 6 },  // 170
+        { 0, 1,18,18, 3, 6, 5, 3,-2, 2,18,-1 },  // 171
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 172
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 173
+        { 0, 1, 1,18,-1, 3, 8, 5, 6, 1, 2, 3 },  // 174
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 175
+        { 0, 2,18,18, 2, 3, 6,18,-1, 4, 2, 3 },  // 176
+        { 1, 1, 1, 3, 5,18, 2, 6, 7, 2, 3, 1 },  // 177
+        { 1, 1, 1, 3, 8,18, 5, 2, 7, 1, 3,-2 },  // 178
+        { 0, 2,17, 2,18, 3, 6, 2, 4, 5, 8, 3 },  // 179
+        { 0, 1,18,17, 2,18, 3, 2, 7,-2,18, 4 },  // 180
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 181
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 182
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 183
+        { 1, 2,18,-3,18,-1, 3,-2, 5, 7, 1, 2 },  // 184
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 185
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 186
+        { 0, 3,18,18, 2, 6,18, 5,18, 2, 3,17 },  // 187
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 188
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 189
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 190
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 191
+        { 1, 3, 1,-1, 1, 3,-2, 2, 5, 7,-3,18 },  // 192
+        { 1, 2,18, 7, 3,-3, 2, 8, 2, 5, 4,17 },  // 193
+        { 1, 1, 1, 4, 5, 1, 3, 4, 6, 7, 8, 3 },  // 194
+        { 0, 1,18,17, 2,18,-1, 2, 3,18, 2, 4 },  // 195
+        { 0, 2,18,18,-2,18, 2, 3, 4, 7, 5,17 },  // 196
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 197
+        { 1, 1,17,18, 2, 1, 3, 2, 5, 1, 2, 3 },  // 198
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 199
+        { 0, 2,18,18,-1, 2, 3, 5, 8, 6, 1,-2 },  // 200
+        { 0, 1,17,18, 8, 3, 4, 6, 5, 2, 8, 7 },  // 201
+        { 1, 2, 1, 3,-2,18, 2, 5, 1, 7,-1,-2 },  // 202
+        { 0, 3,18,17,-1, 3,18, 2, 3, 6, 4,17 },  // 203
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 204
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 205
+        { 1, 2,18,18, 4,18, 6, 7, 8, 3,18, 2 },  // 206
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 207
+        { 0, 2,17,-3,17, 2,-2, 8, 3,18, 4,-3 },  // 208
+        { 1, 1,18,17, 3, 5, 6, 2, 8, 1, 3, 7 },  // 209
+        { 0, 1,18,18, 3, 6, 5, 3,-2, 2,18,-1 },  // 210
+        { 0, 3,18,18, 2, 6,18, 5,18, 2, 3,17 },  // 211
+        { 1, 1,18,18, 5, 4, 6, 4, 5, 1, 4, 3 },  // 212
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 213
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 214
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 215
+        { 0, 2, 3,17,18,-3, 2, 5,18, 6,-1, 7 },  // 216
+        { 1, 1,17,18, 3, 2, 5,-1, 6, 8, 4, 7 },  // 217
+        { 1, 1,18, 1,-2, 3, 2, 1, 7, 6, 3, 4 },  // 218
+        { 0, 3, 1, 2,17, 3,18, 2, 7, 5, 4,-1 },  // 219
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 220
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 221
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 222
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 223
+        { 1, 1,17,-2, 2,18,18, 8, 5, 3, 2, 6 },  // 224
+        { 0, 2,18, 5,18, 2, 3, 7,-2, 1, 6, 8 },  // 225
+        { 0, 1, 2,-1,18,-1, 2, 4,-3, 5,18, 3 },  // 226
+        { 0, 1, 3,17,18, 5, 2,18, 7, 3, 6, 5 },  // 227
+        { 1, 4, 1, 2, 5,18,-2, 2, 3, 7,-1, 4 },  // 228
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 229
+        { 0, 1, 1,18, 2, 1, 3, 4, 1, 5, 2, 7 },  // 230
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 231
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 232
+        { 0, 1,17,17,18, 2, 4, 5,18,-2, 6, 3 },  // 233
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 234
+        { 0, 2,18,18,-1, 3, 5, 6, 8,18, 2, 3 },  // 235
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 236
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 237
+        { 0, 1,18,18, 4, 6, 8,18, 7, 3, 2, 5 },  // 238
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 239
+        { 0, 2,-1,18,18,18, 2, 4,-2, 2, 3, 6 },  // 240
+        { 0, 2,18,-2, 7, 1, 3, 2, 4, 6,-3, 7 },  // 241
+        { 1, 1,17,18, 8, 3, 4, 6,-2, 5, 3, 8 },  // 242
+        { 0, 2,18, 1, 2, 6, 2, 8, 3,18, 5, 4 },  // 243
+        { 1, 1, 3,18,18, 2,18, 2,18, 3, 2,18 },  // 244
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 245
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 246
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 247
+        { 1, 1, 3,17,18, 5, 2, 6, 7, 1, 4, 8 },  // 248
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 249
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 250
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 251
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 252
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 253
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 254
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 255
+};
+
+static const WavpackDecorrSpec very_high_specs [] = {
+        { 1, 2,18,18, 2, 3,-2,18, 2, 4, 7, 5, 3, 6, 8,-1,18, 2 },        // 0
+        { 0, 1,18,18,-1,18, 2, 3, 4, 6, 5, 7,18,-3, 8, 2,-1, 3 },        // 1
+        { 1, 2, 1,18,-2, 4,18, 2, 3, 6,-1, 7, 5,-2,18, 8, 2, 4 },        // 2
+        { 0, 1,17,17, 2, 3, 4,18,-1, 5, 6, 7,18, 2, 8,17, 3,-2 },        // 3
+        { 1, 1,18,18, 2,18, 3, 2,18, 4,-1, 3,18, 2, 6, 8,17, 5 },        // 4
+        { 0, 2,18,17, 2, 3,-2, 5,18,-3, 2, 4, 7, 3, 6, 8, 5,17 },        // 5
+        { 1, 1,18,-2, 2,-3,18, 5,-2,18, 2, 3, 6, 2,17, 4, 7,-1 },        // 6
+        { 1, 1,17, 8,18, 3,-2, 2, 5, 4,18, 6, 3, 8, 7, 2, 5, 4 },        // 7
+        { 0, 2,18,17,-2, 2,18, 3, 2, 5,-3, 4, 7,18, 3, 8, 6, 2 },        // 8
+        { 1, 1, 3, 6, 5, 5, 1, 3, 7, 4, 2, 6, 4,18, 3, 7, 5, 6 },        // 9
+        { 1, 2, 1,18, 3, 2,-2, 1, 5, 4, 6, 2, 7, 1, 8, 3,-1, 1 },        // 10
+        { 0, 1,18,18, 2, 3, 6, 3, 5,-2, 2, 4,18, 3,-2,-1, 6, 7 },        // 11
+        { 0, 1,-2,18, 2,18, 7, 2, 6,-2, 3, 4,18,18, 2,-3, 8, 5 },        // 12
+        { 0, 2,18,18,18, 2, 4, 3,18, 5, 3, 6,-2, 2, 4,18, 8, 7 },        // 13
+        { 0, 1,-2, 1,18, 2,-2,18,-1, 5, 7, 2, 3, 4,18, 2, 6, 2 },        // 14
+        { 1, 1,17,18, 3, 2, 1, 7,-1, 2, 4, 3, 5, 6,-2,18, 7, 8 },        // 15
+        { 1, 1,18,18, 2,18, 3, 4, 6,-2,18, 5, 8, 2, 3, 7, 4,-1 },        // 16
+        { 0, 1,18,18,18,-1, 2, 3, 4, 6, 8,18, 3, 5, 2, 6, 7, 4 },        // 17
+        { 1, 1,17,-2,18,18, 2, 5, 3, 8, 2,-1, 6, 1, 3, 4, 7, 5 },        // 18
+        { 0, 1,17,17,18, 2, 3, 6,-2, 8, 1, 7, 5, 2, 3, 1, 4, 8 },        // 19
+        { 1, 1,17,17, 3, 2, 7, 1, 4, 3, 6, 2, 5,-2, 8, 7,18, 6 },        // 20
+        { 0, 1,18,17,-2, 2,18, 3,-3, 7, 6, 5, 2, 4,-1, 8, 3,17 },        // 21
+        { 1, 1, 2,18,18,-2, 2, 4,-1, 5,18, 3, 8, 6, 2, 7,17, 4 },        // 22
+        { 0, 1,17, 3, 6, 8, 5, 4, 3, 8, 1,18, 7, 2, 4, 5, 6, 3 },        // 23
+        { 1, 2,17,18, 4, 8, 3, 2, 5, 7, 6, 8, 2, 7,-2,18, 3, 4 },        // 24
+        { 1, 1, 6, 5, 5, 3, 4, 7, 3, 2, 4, 6, 3, 7, 1, 5, 2, 4 },        // 25
+        { 1, 1, 1,18,-1, 2, 1, 3, 8,-2, 2, 5, 6, 3, 8, 7,18, 4 },        // 26
+        { 0, 1, 1,17,-1,18, 3, 2, 5, 4, 6, 7, 8, 3, 4, 2, 1,-2 },        // 27
+        { 0, 1,18, 2,18,18, 2,18, 6,-2,18, 7, 5, 4, 3, 2,18,-2 },        // 28
+        { 0, 3, 1, 4,18, 3, 2, 4, 1, 5, 2, 3, 6,18, 8, 7, 2, 4 },        // 29
+        { 0, 1,17,-2, 1,-3, 2,18, 3,-2, 4,18, 3, 6, 7,-3, 2, 8 },        // 30
+        { 1, 1,17,18,18, 4, 2, 3, 7, 6,18, 8, 5,-1, 4, 2, 3,17 },        // 31
+        { 1, 2,18,-1,17,18, 2, 3,-2,18, 5, 8, 2, 4, 3, 7, 6,-1 },        // 32
+        { 1, 1,18,18,18,-2, 4, 2, 3,18, 5, 8, 2, 4, 6, 7,-2, 3 },        // 33
+        { 1, 2,18,18,-2,18,-1, 3, 2, 5,18,-2, 7, 2, 3, 4, 6, 8 },        // 34
+        { 0, 1,17,18,-1, 2, 4,18, 8, 3, 6, 5, 7,-3, 2, 4, 3,17 },        // 35
+        { 1, 1,18,18,17, 2,-1,18, 3, 2,18, 6, 5, 4,18, 7, 2,-1 },        // 36
+        { 0, 2, 1,18,-1,18, 3, 2, 4, 6,-3, 7,-1, 5, 1, 2, 3, 8 },        // 37
+        { 1, 1, 1,17,-2, 2,-3, 6, 3, 5, 1, 2, 7, 6, 8,-2, 4, 1 },        // 38
+        { 0, 1,17,-1, 5, 1, 4, 3, 6, 2,-2,18, 3, 2, 4, 5, 8,-1 },        // 39
+        { 0, 2,18,18,17, 2, 3,-2, 5,18, 2, 4, 7, 8, 6,17, 3, 5 },        // 40
+        { 1, 1, 1, 5, 1, 3, 4, 3, 7, 5, 1, 3, 6, 1, 2, 4, 3, 8 },        // 41
+        { 1, 2, 1,-1, 3, 2,18, 7,-2, 5, 2, 6, 4, 3,-1,18, 8, 7 },        // 42
+        { 0, 2,18,17, 3,18, 2, 5, 4, 3, 6, 2, 7, 8,18, 3, 4, 5 },        // 43
+        { 1, 1, 3, 6,17, 8, 7, 5,18,-1, 1, 2, 3, 4, 2, 6, 8, 1 },        // 44
+        { 0, 2,18,18, 3,-3,18, 2, 6, 5, 3, 7,18, 4,-2, 8, 2, 3 },        // 45
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 46
+        { 1, 1,17, 1, 7, 2, 3,18,-2, 3, 6, 4, 2, 7, 8, 5, 3,17 },        // 47
+        { 1, 1, 3, 6, 5, 5, 1, 3, 7, 4, 2, 6, 4,18, 3, 7, 5, 6 },        // 48
+        { 0, 1,18,18,18, 2, 4,-1,18, 8,-1, 2, 3, 4, 6,-2, 1, 7 },        // 49
+        { 1, 1,18,-2,17,18, 2, 6, 3,-2, 5, 4, 7, 1,-3, 8, 2, 6 },        // 50
+        { 0, 1,17,18,18, 4, 2, 7, 3, 6,-2,18, 8, 4, 5, 2, 7,17 },        // 51
+        { 1, 1,18,18, 5, 4, 6, 4, 1, 5, 4, 3, 2, 5, 6, 1, 4, 5 },        // 52
+        { 0, 1,18,18,-2,18, 2,-3, 3, 8, 5,18, 6, 4, 3,-1, 7, 2 },        // 53
+        { 1, 1,18, 2,-2,-3,18, 5, 2, 3,-2, 4, 6, 1,-3, 2, 7, 8 },        // 54
+        { 0, 1,18, 3, 5, 8, 2, 6, 7, 3, 1, 5, 2,-1, 8, 6, 7, 4 },        // 55
+        { 1, 1, 4, 3, 8, 1, 5, 6, 2, 5, 8,-2, 2, 7, 3,18, 5, 4 },        // 56
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 57
+        { 1, 1,17, 3,18,18, 7, 2, 4,18, 6, 2, 3,-1, 8, 5,18,-3 },        // 58
+        { 0, 1, 3,17,18, 2,18, 6, 7,-3,18, 2, 5, 6, 3, 8, 7,-1 },        // 59
+        { 1, 1,18,18, 2,18,18, 2,-1, 7, 3,18, 5, 2, 6, 4,-1,18 },        // 60
+        { 0, 3,18, 3, 4, 1, 5, 2,18, 4, 2, 3,18, 7, 6, 1, 2, 4 },        // 61
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 62
+        { 1, 1,17, 1,18, 2, 3, 6, 4, 5, 7,18, 3, 8, 2, 4,-2,17 },        // 63
+        { 1, 2,18,17, 2, 3, 5,18, 6,-2, 7, 3, 2, 4,18, 8,-1, 5 },        // 64
+        { 0, 2, 1,18,-1,18, 3, 2, 4, 6,-3, 7,-1, 5, 1, 2, 3, 8 },        // 65
+        { 1, 1, 1,18,-1, 8, 2, 6, 3,-2, 1, 2, 5, 4,-3, 8, 6, 3 },        // 66
+        { 0, 1,18,18, 2,18, 2,18, 7, 6,18, 2,-2, 3, 5, 4,18, 8 },        // 67
+        { 1, 2,18,17, 2, 3,18,-1, 2, 3, 6,18, 5, 4, 3, 7, 2, 8 },        // 68
+        { 1, 2,18,18, 3,-2, 4,18, 5, 7, 6, 2, 4,-3, 8, 5,18, 3 },        // 69
+        { 1, 1,17,-2,18,18, 2, 5, 3, 8, 2,-1, 6, 1, 3, 4, 7, 5 },        // 70
+        { 1, 1, 3,17,18, 5, 7, 2, 4, 6, 1, 8,-1, 3, 7, 4, 1, 2 },        // 71
+        { 0, 2, 1,-2, 2,18, 3, 5, 2, 4, 7,-1, 2, 3, 5,18,-2, 4 },        // 72
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 73
+        { 1, 1, 1, 2,-2, 6,18,-3, 2, 7, 3,-2, 5, 6, 1, 8, 2, 4 },        // 74
+        { 0, 1,18,18,18, 3,-2, 6,18, 2, 4, 3, 5, 8, 7, 6, 2,-2 },        // 75
+        { 1, 1, 1, 5, 1, 3, 4, 3, 7, 5, 1, 3, 6, 1, 2, 4, 3, 8 },        // 76
+        { 0, 1, 3,17,18, 2, 5,18, 6, 7, 5,-2, 2, 4,18, 3, 6, 8 },        // 77
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 78
+        { 0, 2,17,-1,18, 2, 4,-1, 8, 3,18, 7,-3, 4, 5, 1, 2,-2 },        // 79
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 8, 6, 4, 5, 7,-1 },        // 80
+        { 1, 1,18,18, 3, 6, 4, 8,-2, 2, 5, 3, 7,18, 6, 8, 4, 2 },        // 81
+        { 1, 1,17,18,18,-2, 5, 2, 3, 1, 4,-1, 8, 6, 5, 3, 2,18 },        // 82
+        { 1, 1,17,17, 1, 2, 4, 5, 2, 6,-1, 3, 1, 1,-2, 4, 2, 7 },        // 83
+        { 1, 1,17, 1, 7, 2, 3,18,-2, 3, 6, 4, 2, 7, 8, 5, 3,17 },        // 84
+        { 0, 1,18,17,-2,-3, 1, 2, 3, 2, 5, 4, 7,-3, 6,-2, 2, 1 },        // 85
+        { 1, 1, 1, 3, 5,18, 1, 2, 7, 3, 6, 2, 5, 8,-1, 1, 4, 7 },        // 86
+        { 1, 1,17, 3, 6, 8, 1, 4, 5, 3,-2, 7, 2, 8, 5, 6,18, 3 },        // 87
+        { 1, 1,17,18, 2, 4, 8,-2, 3, 1, 5, 6, 7, 1, 2, 3, 4, 7 },        // 88
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 89
+        { 1, 1, 3, 1, 8,18, 5, 2, 3,18, 6, 7,-2, 4, 3, 2, 8,18 },        // 90
+        { 0, 1,18,17, 2,18, 3, 4,-1,18, 7, 6, 2, 8, 4,18,18, 5 },        // 91
+        { 0, 1,18,18, 2,18,18, 2, 7,-2, 6, 5, 4, 3,18, 3, 2,17 },        // 92
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 93
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 94
+        { 1, 1,17, 8,18, 3, 2, 1, 5, 4, 6,-1, 3,-3, 8,18, 7, 2 },        // 95
+        { 1, 2,18,17,18, 2, 3, 5,-2,18, 6,-1, 2, 3, 7, 4, 8,17 },        // 96
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 8, 6, 4, 5, 7,-1 },        // 97
+        { 1, 2,18,18,-2,17, 2,18, 3, 4,18, 8, 7,-1, 2, 4, 5,17 },        // 98
+        { 0, 2,17,-3,17, 3, 2,-2,18, 8, 4,-3, 2,18, 5, 3,-2, 6 },        // 99
+        { 0, 1,18,18, 2,18,18, 2, 7,-2, 6, 5, 4, 3,18, 3, 2,17 },        // 100
+        { 0, 2, 1,18,-1, 3, 5, 2,-3,18, 7, 3,-1, 6, 4, 2,17, 5 },        // 101
+        { 1, 1,17,-2,17, 2,-3, 1, 5,-1, 4, 6, 3, 2, 8, 7,-2, 5 },        // 102
+        { 1, 1, 1,18, 1, 3, 5, 8, 6, 2, 3,-1, 7, 1, 4, 8, 5,-3 },        // 103
+        { 0, 2, 3,18,18, 2,18,-2, 6, 5, 7, 2, 4,18, 3, 6,-3, 5 },        // 104
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 105
+        { 1, 1, 3, 6,17, 8, 7, 5,18,-1, 1, 2, 3, 4, 2, 6, 8, 1 },        // 106
+        { 0, 4,18, 2,17, 3,18,-2, 2, 6,18, 2, 7, 3, 5, 4, 8,18 },        // 107
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 108
+        { 0, 1,18,18, 2, 3, 6, 3, 5,-2, 2, 4,18, 3,-2,-1, 6, 7 },        // 109
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 110
+        { 1, 1,17, 1, 2, 5, 3,-2, 1, 4, 3, 7, 6,-3, 2, 1, 1, 2 },        // 111
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 112
+        { 1, 1,18,18,-2,18,-2, 2, 3, 6,18, 4,-1, 2, 3, 8, 1, 4 },        // 113
+        { 1, 1,17,-2,17, 2,-3, 1, 5,-1, 4, 6, 3, 2, 8, 7,-2, 5 },        // 114
+        { 0, 1,17,17,18, 3, 2,18,18, 6, 8, 2,-2, 3, 5, 4,17,18 },        // 115
+        { 1, 1, 1, 5, 1, 3, 4, 3, 7, 5, 1, 3, 6, 1, 2, 4, 3, 8 },        // 116
+        { 1, 1, 1, 3,-3,18,18, 6, 5,18, 2,-1, 3, 8, 7,-3, 4,17 },        // 117
+        { 1, 1,18, 1, 2, 1, 3, 8, 7, 4, 1, 5, 2,-1,-3,18, 6, 2 },        // 118
+        { 0, 1,18, 3, 5, 2, 6, 8,18, 5, 7, 2, 3,-1, 6, 7, 8, 5 },        // 119
+        { 0, 2,18, 3,-2, 7, 8, 2, 5, 4,-3, 8, 3, 2,18, 5, 4, 6 },        // 120
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 121
+        { 1, 3, 1, 1, 2, 5, 2, 7, 4, 3,-1,18,-2, 8, 2, 1, 6, 7 },        // 122
+        { 0, 1, 3,17,18, 5, 2, 6, 7,18, 4, 5, 3, 6,18, 2, 7, 8 },        // 123
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 124
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 125
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 126
+        { 0, 1, 1,18, 1, 2, 3, 5, 1, 2, 6, 7, 4, 3, 8, 1,17, 5 },        // 127
+        { 1, 2,17,-1,18,-2, 2, 3, 5,18, 2, 4, 6, 7, 3,-1, 5, 8 },        // 128
+        { 1, 1,18,18,-3,18,-2, 2, 3,-2,18, 6, 4, 5, 8, 3,17,-3 },        // 129
+        { 1, 1,18, 7, 6, 5, 5, 3, 1, 4, 2, 7, 3, 4,-3, 6,18, 8 },        // 130
+        { 0, 2,18,18, 2, 3, 5,18, 2, 4, 3, 6,18, 7, 8,-1, 5, 2 },        // 131
+        { 0, 1,18,17,-1, 2,18, 3, 2,18, 4, 3,18, 2, 6, 5, 8,17 },        // 132
+        { 0, 2,18,17, 2, 3,18, 5,-1, 6, 7, 8, 2, 3, 4, 5,18, 6 },        // 133
+        { 1, 2,18,-3,18, 2, 3,-2,-3, 5,18, 7, 6, 2, 4, 3, 8,-2 },        // 134
+        { 1, 1,17,18,18,-2, 2, 3, 5, 4, 8,18,-1, 5, 3, 6,-2, 7 },        // 135
+        { 1, 2,18,17, 2,-2,18, 3,-1, 4,18, 2, 7, 5, 3, 8, 6, 4 },        // 136
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 137
+        { 1, 1, 1, 5, 1, 3, 4, 3, 7, 5, 1, 3, 6, 1, 2, 4, 3, 8 },        // 138
+        { 0, 2,18,18, 3, 3,-2, 2, 5,18, 6, 3,-1, 4, 7,-1, 1, 2 },        // 139
+        { 0, 1,-2, 1,18, 2,-2, 5, 7,18, 3, 2, 6, 2,-1, 4,-2,17 },        // 140
+        { 0, 2,18,18,18, 2, 3,-2,18, 5, 4, 2, 6, 8, 3,-2, 4,18 },        // 141
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 142
+        { 1, 1,17,18,-1, 3, 2, 5, 1, 3, 2, 8, 4, 7, 6, 2,-1, 5 },        // 143
+        { 1, 1,17,18,18, 4, 2, 3, 7, 6,18, 8, 5,-1, 4, 2, 3,17 },        // 144
+        { 0, 1,18,18,-2,18, 2, 3, 4, 5, 6,18, 8, 2, 3, 7,-2, 4 },        // 145
+        { 0, 1,18,-2,18,18,-3,-2, 2, 3, 5, 8, 1, 2, 6, 4, 7,-1 },        // 146
+        { 0, 1,18,17, 2,18, 3,-2, 2, 7, 6, 4,18, 3, 8, 7, 4, 2 },        // 147
+        { 1, 1,17,18,18, 4, 2, 3, 7, 6,18, 8, 5,-1, 4, 2, 3,17 },        // 148
+        { 1, 1,18,17,18, 2, 5, 3,-2,18, 6, 2, 3, 4, 8, 7, 5,-1 },        // 149
+        { 0, 1, 2,-1,18,-1, 2, 4,-3,18, 5, 3, 6,18, 2, 4, 7, 8 },        // 150
+        { 1, 1,17,18, 8, 3, 6, 4,-1, 5, 2, 7, 3, 8, 6, 5,18, 4 },        // 151
+        { 0, 2,18, 3,-2, 7, 8, 2, 5, 4,-3, 8, 3, 2,18, 5, 4, 6 },        // 152
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 153
+        { 1, 1, 1,18,-1, 8, 2, 6, 3,-2, 1, 2, 5, 4,-3, 8, 6, 3 },        // 154
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 155
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 156
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 157
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 158
+        { 0, 1,17,18,18, 4, 2, 7, 3, 6,-2,18, 8, 4, 5, 2, 7,17 },        // 159
+        { 1, 2,18,-1,18, 3,-2,18, 2, 5, 3, 6, 7, 2,-1,18, 8, 4 },        // 160
+        { 1, 2, 1,18,-2, 4,18, 2, 3, 6,-1, 7, 5,-2,18, 8, 2, 4 },        // 161
+        { 1, 2, 1,18,-3, 2, 3,18,-1, 5, 6, 2, 8, 3, 4, 1,-2, 7 },        // 162
+        { 0, 1, 1,17,-1,18, 3, 2, 5, 4, 6, 7, 8, 3, 4, 2, 1,-2 },        // 163
+        { 1, 1,18,17,18, 4, 3, 5, 1, 2, 6, 3, 4, 7, 1, 8, 5, 2 },        // 164
+        { 0, 1,18,-2, 7, 1, 3, 2,-3, 4, 6,-2, 7, 8, 1, 5, 4, 3 },        // 165
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 166
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 167
+        { 0, 2,18,18,18,-2, 2, 5, 3, 7,18, 2, 4,-3, 5, 6, 3, 8 },        // 168
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 169
+        { 0, 3, 3,18,-1, 5, 2, 7,18, 6, 5, 2, 4, 3,-1, 7,18, 6 },        // 170
+        { 0, 2,18,18,18, 4, 3, 2, 6, 4, 8,18, 5, 3, 2, 7,-2, 6 },        // 171
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 172
+        { 0, 2,18,18,18, 2, 3,-2,18, 5, 4, 2, 6, 8, 3,-2, 4,18 },        // 173
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 174
+        { 1, 1,17, 8,18, 3, 2, 1, 5, 4, 6,-1, 3,-3, 8,18, 7, 2 },        // 175
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 176
+        { 0, 1,-1,18,18,18, 2, 4, 6,-2, 2, 8, 3, 4,18, 7,-1, 6 },        // 177
+        { 0, 1,18, 1,-2, 2, 4, 1, 3,-1, 2, 5, 7, 1, 6, 8,-2,17 },        // 178
+        { 0, 1,17,17,18, 2, 5, 4,18, 3, 8, 7, 4, 6, 8, 1, 5, 2 },        // 179
+        { 1, 2,18,18, 5, 4, 6, 3, 4,18, 8, 4,-1, 7, 5, 3, 6, 2 },        // 180
+        { 0, 1,18,18,-3,18, 3, 6, 2, 5, 7,18, 3, 8,-1, 4, 5, 2 },        // 181
+        { 1, 1,18, 2,-2,-3,18, 5, 2,-2, 4, 3, 6,18, 8,-1, 2, 7 },        // 182
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 183
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 184
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 185
+        { 1, 1,17, 1, 7, 2, 3,18,-2, 3, 6, 4, 2, 7, 8, 5, 3,17 },        // 186
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 187
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 188
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 189
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 190
+        { 0, 1,17,18, 3,18, 2, 5, 4, 7,-3, 6, 3, 2,18, 4, 7, 3 },        // 191
+        { 1, 1, 1, 7, 4, 5, 3, 4, 5, 1, 3, 6, 3, 2, 4, 8,-2, 7 },        // 192
+        { 0, 1, 1,18,-1,-2,18, 3, 2,-1, 6, 7, 4, 5, 3,18, 2,-3 },        // 193
+        { 1, 1,18,18,-1, 3, 6,18, 5, 4, 8, 2, 3, 6,18, 7, 4,-2 },        // 194
+        { 0, 2,18,18, 2, 6,18, 2,18, 5, 3,18, 2, 4, 7, 8, 3,18 },        // 195
+        { 1, 1, 3,18,18, 5,18, 6, 2, 4, 7,-2,18, 5, 8, 6, 3, 2 },        // 196
+        { 0, 1,18,-2, 7, 1, 3, 2,-3, 4, 6,-2, 7, 8, 1, 5, 4, 3 },        // 197
+        { 1, 1,18,-2,18, 2, 5,18, 3,-2, 4, 7, 2,-1, 8, 6, 5, 1 },        // 198
+        { 1, 1,17,17, 5,18, 4, 1, 2, 8, 6, 4,-2, 3, 5,-1, 1, 8 },        // 199
+        { 0, 2, 1, 2,17, 3, 7,18, 2,-1, 4, 5,18, 2, 7, 3, 6, 8 },        // 200
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 201
+        { 1, 1, 3, 6,17, 8, 7, 5,18,-1, 1, 2, 3, 4, 2, 6, 8, 1 },        // 202
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 203
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 204
+        { 0, 2,18,18,18, 2,-2, 3, 6, 4, 8,18, 2, 5, 7, 4, 3, 6 },        // 205
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 206
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 207
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 208
+        { 1, 1,18, 1, 8, 3, 5, 6, 4,-1, 8, 3, 7,18, 2, 5, 8, 4 },        // 209
+        { 1, 1,17,18, 5, 2, 4, 3, 1, 6,-2, 1, 3, 2, 4, 5,-1,17 },        // 210
+        { 1, 1,18,17, 2,18, 3,-3, 7, 2, 6, 4, 3, 5,18, 8, 2,-2 },        // 211
+        { 1, 1,18,17,18, 4, 3, 5,-1,18, 2, 7, 8, 4, 6, 3,18, 5 },        // 212
+        { 0, 1,18,17,18,-2, 2,-3, 3, 4, 8, 5, 2,18, 6, 3, 7,-2 },        // 213
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 214
+        { 1, 1,17,18, 8, 3, 4, 6,18, 5,-2, 3, 8, 5, 2, 4, 7, 6 },        // 215
+        { 0, 1,18,-2, 3, 5, 1, 7, 3, 2, 6,-3, 4, 1, 5, 8, 3,-2 },        // 216
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 217
+        { 1, 1, 3,17,18, 5,-1,18, 2, 6, 7,18, 5, 3,-3,-1, 6, 2 },        // 218
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 219
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 220
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 221
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 222
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 223
+        { 1, 3,18,17,-2, 3,-1,18, 2, 5, 3, 7, 6, 2, 4, 8,18, 5 },        // 224
+        { 0, 1,18,-1,18, 2,18, 3, 5,18, 2, 8,18, 5, 4,-1, 6, 2 },        // 225
+        { 1, 2,18,-2,18,18, 2, 3, 4,-3, 2, 5,18, 7, 4, 3, 8, 6 },        // 226
+        { 0, 2,17,-1,18, 2,-1, 1, 7, 3, 8, 5,-2, 4, 1, 2,-3, 6 },        // 227
+        { 0, 1,18,17, 2,18, 2,18, 6, 7, 4, 3,18, 5, 2,-2,17, 8 },        // 228
+        { 0, 3,18,17, 2, 3,-3,-1,18, 2, 4, 5,18, 7, 3, 2,-3, 6 },        // 229
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 230
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 231
+        { 0, 2, 3,18,18,18, 2, 6, 5,18, 7, 2, 4, 6,18, 5, 3, 8 },        // 232
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 233
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 234
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 235
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 236
+        { 0, 1,18,18, 3, 6, 3,-2, 2,18, 5,-1, 7, 3, 4,-2, 2, 6 },        // 237
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 238
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 239
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 240
+        { 1, 1,18,17,18,18,-2, 2, 3,-3,18, 6, 4, 2,-2, 8, 3, 7 },        // 241
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 242
+        { 0, 1,18,18,18, 4, 2, 7, 8,18, 3, 2,-2, 4, 7, 6,17, 5 },        // 243
+        { 1, 1,18,18,-1,-2, 8, 3,18, 6, 3, 5, 8, 2, 4, 7, 1, 6 },        // 244
+        { 1, 1, 1,-3, 3,18,18, 2,-1, 3, 6, 5,18, 4, 7,-2, 8, 3 },        // 245
+        { 1, 1, 1,18, 4, 2, 5,18, 1, 3,-1, 6, 1, 4, 8, 2, 5, 1 },        // 246
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 247
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 248
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 249
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 250
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 251
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 252
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 253
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 254
+        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 255
+};
+
+#define NUM_FAST_SPECS (sizeof (fast_specs) / sizeof (*fast_specs))                     // element count of fast_specs
+#define NUM_DEFAULT_SPECS (sizeof (default_specs) / sizeof (*default_specs))            // element count of default_specs
+#define NUM_HIGH_SPECS (sizeof (high_specs) / sizeof (*high_specs))                     // element count of high_specs
+#define NUM_VERY_HIGH_SPECS (sizeof (very_high_specs) / sizeof (*very_high_specs))      // element count of very_high_specs
diff --git a/third_party/wavpack/src/decorr_utils.c b/third_party/wavpack/src/decorr_utils.c
new file mode 100644
index 0000000..a76b14c
--- /dev/null
+++ b/third_party/wavpack/src/decorr_utils.c
@@ -0,0 +1,204 @@
+////////////////////////////////////////////////////////////////////////////
+//                           **** WAVPACK ****                            //
+//                  Hybrid Lossless Wavefile Compressor                   //
+//              Copyright (c) 1998 - 2013 Conifer Software.               //
+//                          All Rights Reserved.                          //
+//      Distributed under the BSD Software License (see license.txt)      //
+////////////////////////////////////////////////////////////////////////////
+
+// decorr_utils.c
+
+// This module contains the functions that process metadata blocks that are
+// specific to the decorrelator. These would be called any time a WavPack
+// block was parsed. These are in a module separate from the actual unpack
+// decorrelation code (unpack.c) so that if an application just wants to get
+// information from WavPack files (rather than actually decoding audio) then
+// less code needs to be linked.
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "wavpack_local.h"
+
+///////////////////////////// executable code ////////////////////////////////
+
+// Decode the decorrelation terms metadata into decorr_passes (filled from the
+// last pass backwards). Each byte packs term + 5 in its low five bits and the
+// delta in its top three; reserved terms make the function return FALSE.
+
+int read_decorr_terms (WavpackStream *wps, WavpackMetadata *wpmd)
+{
+    unsigned char *src = (unsigned char *)wpmd->data;
+    int remaining = wpmd->byte_length;
+    struct decorr_pass *pass;
+
+    if (remaining > MAX_NTERMS)
+        return FALSE;
+
+    wps->num_terms = remaining;
+    pass = wps->decorr_passes + remaining - 1;
+
+    for (; remaining--; pass--) {
+        const unsigned char packed = *src++;
+        pass->term = (int)(packed & 0x1f) - 5;
+        pass->delta = (packed >> 5) & 0x7;
+
+        if (pass->term == 0 || pass->term < -3 || pass->term > 18 ||     // zero or outside -3..18
+            (pass->term > MAX_TERM && pass->term < 17) ||                // gap between MAX_TERM and 17
+            (pass->term < 0 && (wps->wphdr.flags & MONO_DATA)))          // negative term with mono data
+                return FALSE;
+    }
+
+    return TRUE;
+}
+
+// Decode the decorrelation weights metadata into decorr_passes. Every stored
+// byte is expanded with restore_weight(); stereo data carries an A/B pair per
+// term while mono data carries only A. Weights are handed out starting with
+// the last pass and moving backwards, and any pass without a stored weight is
+// left at zero. Returns FALSE when more weights are stored than terms exist.
+
+int read_decorr_weights (WavpackStream *wps, WavpackMetadata *wpmd)
+{
+    const int mono = (wps->wphdr.flags & MONO_DATA) != 0;
+    int stored = (int) wpmd->byte_length / (mono ? 1 : 2);
+    char *src = (char *)wpmd->data;
+    int i;
+
+    if (stored > wps->num_terms)
+        return FALSE;
+
+    for (i = 0; i < wps->num_terms; i++)
+        wps->decorr_passes [i].weight_A = wps->decorr_passes [i].weight_B = 0;
+
+    for (i = wps->num_terms - 1; i >= 0 && stored--; i--) {
+        struct decorr_pass *pass = wps->decorr_passes + i;
+
+        pass->weight_A = restore_weight (*src++);
+
+        if (!mono)
+            pass->weight_B = restore_weight (*src++);
+    }
+
+    return TRUE;
+}
+
+// Decode the decorrelation samples metadata into decorr_passes. Values are
+// 16-bit little-endian quantities expanded with wp_exp2s(). All sample history
+// is cleared first, then passes are filled from the last one backwards; how
+// many values a pass takes depends on its term, so the terms must already be
+// known. Returns TRUE only if the metadata was consumed exactly.
+
+int read_decorr_samples (WavpackStream *wps, WavpackMetadata *wpmd)
+{
+    const int mono = (wps->wphdr.flags & MONO_DATA) != 0;
+    const int frame = mono ? 2 : 4;         // bytes holding one 16-bit value per channel
+    unsigned char *src = (unsigned char *)wpmd->data;
+    unsigned char *limit = src + wpmd->byte_length;
+    int i, m;
+
+    for (i = 0; i < wps->num_terms; i++) {
+        CLEAR (wps->decorr_passes [i].samples_A);
+        CLEAR (wps->decorr_passes [i].samples_B);
+    }
+
+    // version 0x402 hybrid blocks carry the error values ahead of the samples
+    if (wps->wphdr.version == 0x402 && (wps->wphdr.flags & HYBRID_FLAG)) {
+        if (src + frame > limit)
+            return FALSE;
+
+        wps->dc.error [0] = wp_exp2s ((int16_t)(src [0] + (src [1] << 8)));
+        src += 2;
+
+        if (!mono) {
+            wps->dc.error [1] = wp_exp2s ((int16_t)(src [0] + (src [1] << 8)));
+            src += 2;
+        }
+    }
+
+    // walk the passes from last to first until the metadata bytes run out
+    for (i = wps->num_terms - 1; i >= 0 && src < limit; i--) {
+        struct decorr_pass *pass = wps->decorr_passes + i;
+
+        if (pass->term > MAX_TERM) {        // two values per channel
+            if (src + 2 * frame > limit)
+                return FALSE;
+
+            pass->samples_A [0] = wp_exp2s ((int16_t)(src [0] + (src [1] << 8)));
+            pass->samples_A [1] = wp_exp2s ((int16_t)(src [2] + (src [3] << 8)));
+            src += 4;
+
+            if (!mono) {
+                pass->samples_B [0] = wp_exp2s ((int16_t)(src [0] + (src [1] << 8)));
+                pass->samples_B [1] = wp_exp2s ((int16_t)(src [2] + (src [3] << 8)));
+                src += 4;
+            }
+        }
+        else if (pass->term < 0) {          // one value for each of A and B
+            if (src + 4 > limit)
+                return FALSE;
+
+            pass->samples_A [0] = wp_exp2s ((int16_t)(src [0] + (src [1] << 8)));
+            pass->samples_B [0] = wp_exp2s ((int16_t)(src [2] + (src [3] << 8)));
+            src += 4;
+        }
+        else {                              // "term" values per channel
+            for (m = 0; m < pass->term; m++) {
+                if (src + frame > limit)
+                    return FALSE;
+
+                pass->samples_A [m] = wp_exp2s ((int16_t)(src [0] + (src [1] << 8)));
+                src += 2;
+
+                if (!mono) {
+                    pass->samples_B [m] = wp_exp2s ((int16_t)(src [0] + (src [1] << 8)));
+                    src += 2;
+                }
+            }
+        }
+    }
+
+    return src == limit;
+}
+
+// Decode the shaping metadata into the stream's dc state. A 2-byte block holds
+// two weight bytes (expanded with restore_weight() and scaled by 2^16) for the
+// two shaping accumulators. Longer blocks hold 16-bit log values: an error and
+// shaping accumulator per channel, plus a shaping delta per channel when the
+// block is exactly 6 (mono) or 12 (stereo) bytes. Shorter sizes return FALSE.
+
+int read_shaping_info (WavpackStream *wps, WavpackMetadata *wpmd)
+{
+    const int mono = (wps->wphdr.flags & MONO_DATA) != 0;
+    unsigned char *src = (unsigned char *)wpmd->data;
+
+    if (wpmd->byte_length == 2) {
+        char *weights = (char *)wpmd->data;
+
+        wps->dc.shaping_acc [0] = (int32_t) restore_weight (weights [0]) << 16;
+        wps->dc.shaping_acc [1] = (int32_t) restore_weight (weights [1]) << 16;
+        return TRUE;
+    }
+
+    if (wpmd->byte_length < (mono ? 4 : 8))
+        return FALSE;
+
+    wps->dc.error [0] = wp_exp2s ((int16_t)(src [0] + (src [1] << 8)));
+    wps->dc.shaping_acc [0] = wp_exp2s ((int16_t)(src [2] + (src [3] << 8)));
+    src += 4;
+
+    if (!mono) {
+        wps->dc.error [1] = wp_exp2s ((int16_t)(src [0] + (src [1] << 8)));
+        wps->dc.shaping_acc [1] = wp_exp2s ((int16_t)(src [2] + (src [3] << 8)));
+        src += 4;
+    }
+
+    if (wpmd->byte_length == (mono ? 6 : 12)) {
+        wps->dc.shaping_delta [0] = wp_exp2s ((int16_t)(src [0] + (src [1] << 8)));
+
+        if (!mono)
+            wps->dc.shaping_delta [1] = wp_exp2s ((int16_t)(src [2] + (src [3] << 8)));
+    }
+
+    return TRUE;
+}
diff --git a/third_party/wavpack/src/entropy_utils.c b/third_party/wavpack/src/entropy_utils.c
new file mode 100644
index 0000000..fe8e405
--- /dev/null
+++ b/third_party/wavpack/src/entropy_utils.c
@@ -0,0 +1,378 @@
+////////////////////////////////////////////////////////////////////////////
+//                           **** WAVPACK ****                            //
+//                  Hybrid Lossless Wavefile Compressor                   //
+//              Copyright (c) 1998 - 2013 Conifer Software.               //
+//                          All Rights Reserved.                          //
+//      Distributed under the BSD Software License (see license.txt)      //
+////////////////////////////////////////////////////////////////////////////
+
+// entropy_utils.c
+
+// This module contains the functions that process metadata blocks that are
+// specific to the entropy decoder; these would be called any time a WavPack
+// block was parsed. Additionally, it contains tables and functions that are
+// common to both entropy coding and decoding. These are in a module separate
+// from the actual entropy encoder (write_words.c) and decoder (read_words.c)
+// so that if applications that just do a subset of the full WavPack reading
+// and writing can link with a subset of the library.
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "wavpack_local.h"
+
+///////////////////////////// local table storage ////////////////////////////
+
+const uint32_t bitset [] = {            // bitset [n] has only bit n set (n = 0..31); unsigned so bit 31 cannot overflow
+    1UL << 0, 1UL << 1, 1UL << 2, 1UL << 3,
+    1UL << 4, 1UL << 5, 1UL << 6, 1UL << 7,
+    1UL << 8, 1UL << 9, 1UL << 10, 1UL << 11,
+    1UL << 12, 1UL << 13, 1UL << 14, 1UL << 15,
+    1UL << 16, 1UL << 17, 1UL << 18, 1UL << 19,
+    1UL << 20, 1UL << 21, 1UL << 22, 1UL << 23,
+    1UL << 24, 1UL << 25, 1UL << 26, 1UL << 27,
+    1UL << 28, 1UL << 29, 1UL << 30, 1UL << 31
+};
+
+const uint32_t bitmask [] = {           // bitmask [n] has the low n bits set (n = 0..31)
+    (1L << 0) - 1, (1L << 1) - 1, (1L << 2) - 1, (1L << 3) - 1,
+    (1L << 4) - 1, (1L << 5) - 1, (1L << 6) - 1, (1L << 7) - 1,
+    (1L << 8) - 1, (1L << 9) - 1, (1L << 10) - 1, (1L << 11) - 1,
+    (1L << 12) - 1, (1L << 13) - 1, (1L << 14) - 1, (1L << 15) - 1,
+    (1L << 16) - 1, (1L << 17) - 1, (1L << 18) - 1, (1L << 19) - 1,
+    (1L << 20) - 1, (1L << 21) - 1, (1L << 22) - 1, (1L << 23) - 1,
+    (1L << 24) - 1, (1L << 25) - 1, (1L << 26) - 1, (1L << 27) - 1,
+    (1L << 28) - 1, (1L << 29) - 1, (1L << 30) - 1, 0x7fffffff
+};
+
+const char nbits_table [] = {           // nbits_table [v]: number of significant bits in the byte value v
+    0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,     // 0 - 15
+    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,     // 16 - 31
+    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,     // 32 - 47
+    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,     // 48 - 63
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,     // 64 - 79
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,     // 80 - 95
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,     // 96 - 111
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,     // 112 - 127
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,     // 128 - 143
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,     // 144 - 159
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,     // 160 - 175
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,     // 176 - 191
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,     // 192 - 207
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,     // 208 - 223
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,     // 224 - 239
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8      // 240 - 255
+};
+
+static const unsigned char log2_table [] = {    // low byte of a log2 result, indexed by the 8 bits below the leading one
+    0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0b, 0x0d, 0x0e, 0x10, 0x11, 0x12, 0x14, 0x15,
+    0x16, 0x18, 0x19, 0x1a, 0x1c, 0x1d, 0x1e, 0x20, 0x21, 0x22, 0x24, 0x25, 0x26, 0x28, 0x29, 0x2a,
+    0x2c, 0x2d, 0x2e, 0x2f, 0x31, 0x32, 0x33, 0x34, 0x36, 0x37, 0x38, 0x39, 0x3b, 0x3c, 0x3d, 0x3e,
+    0x3f, 0x41, 0x42, 0x43, 0x44, 0x45, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4d, 0x4e, 0x4f, 0x50, 0x51,
+    0x52, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63,
+    0x64, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x74, 0x75,
+    0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85,
+    0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95,
+    0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
+    0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb2,
+    0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc0,
+    0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcb, 0xcc, 0xcd, 0xce,
+    0xcf, 0xd0, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd8, 0xd9, 0xda, 0xdb,
+    0xdc, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe4, 0xe5, 0xe6, 0xe7, 0xe7,
+    0xe8, 0xe9, 0xea, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xee, 0xef, 0xf0, 0xf1, 0xf1, 0xf2, 0xf3, 0xf4,
+    0xf4, 0xf5, 0xf6, 0xf7, 0xf7, 0xf8, 0xf9, 0xf9, 0xfa, 0xfb, 0xfc, 0xfc, 0xfd, 0xfe, 0xff, 0xff
+};
+
+static const unsigned char exp2_table [] = {    // indexed by the low byte of a log; wp_exp2s() ORs in 0x100 and shifts the result
+    0x00, 0x01, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b,
+    0x0b, 0x0c, 0x0d, 0x0e, 0x0e, 0x0f, 0x10, 0x10, 0x11, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16, 0x16,
+    0x17, 0x18, 0x19, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1d, 0x1e, 0x1f, 0x20, 0x20, 0x21, 0x22, 0x23,
+    0x24, 0x24, 0x25, 0x26, 0x27, 0x28, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
+    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3a, 0x3b, 0x3c, 0x3d,
+    0x3e, 0x3f, 0x40, 0x41, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x48, 0x49, 0x4a, 0x4b,
+    0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a,
+    0x5b, 0x5c, 0x5d, 0x5e, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+    0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+    0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x87, 0x88, 0x89, 0x8a,
+    0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b,
+    0x9c, 0x9d, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad,
+    0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0,
+    0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc8, 0xc9, 0xca, 0xcb, 0xcd, 0xce, 0xcf, 0xd0, 0xd2, 0xd3, 0xd4,
+    0xd6, 0xd7, 0xd8, 0xd9, 0xdb, 0xdc, 0xdd, 0xde, 0xe0, 0xe1, 0xe2, 0xe4, 0xe5, 0xe6, 0xe8, 0xe9,
+    0xea, 0xec, 0xed, 0xee, 0xf0, 0xf1, 0xf2, 0xf4, 0xf5, 0xf6, 0xf8, 0xf9, 0xfa, 0xfc, 0xfd, 0xff
+};
+
+///////////////////////////// executable code ////////////////////////////////
+
+// Decode the entropy coder's median values from the specified metadata block.
+// Each median is stored as a 16-bit little-endian log2 value and is expanded
+// with wp_exp2s(); there are three per channel. Returns FALSE (storing nothing)
+// unless the block is exactly 6 bytes (mono data) or 12 bytes (stereo).
+
+int read_entropy_vars (WavpackStream *wps, WavpackMetadata *wpmd)
+{
+    const int num_chans = (wps->wphdr.flags & MONO_DATA) ? 1 : 2;
+    unsigned char *src = (unsigned char *)wpmd->data;
+    int chan, m;
+
+    if (wpmd->byte_length != num_chans * 6)
+        return FALSE;
+
+    // channel 0's three medians come first, then channel 1's (if present)
+    for (chan = 0; chan < num_chans; ++chan)
+        for (m = 0; m < 3; ++m) {
+            wps->w.c [chan].median [m] = wp_exp2s (src [0] + (src [1] << 8));
+            src += 2;
+        }
+
+    return TRUE;
+}
+
+// Read the hybrid related values from the specified metadata structure, convert
+// them back to their internal formats and store them. The extended profile
+// stuff is not implemented yet, so return an error if we get more data than
+// we know what to do with.
+
+int read_hybrid_profile (WavpackStream *wps, WavpackMetadata *wpmd)
+{
+    unsigned char *byteptr = (unsigned char *)wpmd->data;
+    unsigned char *endptr = byteptr + wpmd->byte_length;
+
+    if (wps->wphdr.flags & HYBRID_BITRATE) {        // slow_level values are only read in this mode
+        if (byteptr + (wps->wphdr.flags & MONO_DATA ? 2 : 4) > endptr)
+            return FALSE;
+
+        wps->w.c [0].slow_level = wp_exp2s (byteptr [0] + (byteptr [1] << 8));
+        byteptr += 2;
+
+        if (!(wps->wphdr.flags & MONO_DATA)) {
+            wps->w.c [1].slow_level = wp_exp2s (byteptr [0] + (byteptr [1] << 8));
+            byteptr += 2;
+        }
+    }
+
+    if (byteptr + (wps->wphdr.flags & MONO_DATA ? 2 : 4) > endptr)
+        return FALSE;
+    // shift as unsigned: "(int32_t) x << 16" is undefined once the stored value x reaches 0x8000
+    wps->w.bitrate_acc [0] = (int32_t)((uint32_t)(byteptr [0] + (byteptr [1] << 8)) << 16);
+    byteptr += 2;
+
+    if (!(wps->wphdr.flags & MONO_DATA)) {
+        wps->w.bitrate_acc [1] = (int32_t)((uint32_t)(byteptr [0] + (byteptr [1] << 8)) << 16);
+        byteptr += 2;
+    }
+
+    if (byteptr < endptr) {                         // optional bitrate deltas follow
+        if (byteptr + (wps->wphdr.flags & MONO_DATA ? 2 : 4) > endptr)
+            return FALSE;
+
+        wps->w.bitrate_delta [0] = wp_exp2s ((int16_t)(byteptr [0] + (byteptr [1] << 8)));
+        byteptr += 2;
+
+        if (!(wps->wphdr.flags & MONO_DATA)) {
+            wps->w.bitrate_delta [1] = wp_exp2s ((int16_t)(byteptr [0] + (byteptr [1] << 8)));
+            byteptr += 2;
+        }
+
+        if (byteptr < endptr)                       // anything left over is data we cannot handle
+            return FALSE;
+    }
+    else
+        wps->w.bitrate_delta [0] = wps->w.bitrate_delta [1] = 0;
+
+    return TRUE;
+}
+
+// Update the per-channel "error_limit" values, which determine the maximum
+// sample error allowed in the main bitstream of hybrid data. This is called
+// during both encoding and decoding. Every call first advances the bitrate
+// accumulators by their deltas (so the bitrate can be changing). In
+// HYBRID_BITRATE mode the limit is derived from the slow_level values and the
+// accumulated bitrates (optionally rebalanced between the two channels);
+// otherwise the accumulated bitrate is simply expanded with wp_exp2s().
+// For mono data only channel 0 is read or written.
+
+void update_error_limit (WavpackStream *wps)
+{
+    const int num_chans = (wps->wphdr.flags & MONO_DATA) ? 1 : 2;
+    int bitrate [2] = { 0, 0 }, slow_log [2] = { 0, 0 };
+    int chan;
+
+    // step the accumulators for the channels that are present
+    for (chan = 0; chan < num_chans; ++chan)
+        bitrate [chan] = (wps->w.bitrate_acc [chan] += wps->w.bitrate_delta [chan]) >> 16;
+
+    // without HYBRID_BITRATE the accumulated value is expanded directly
+    if (!(wps->wphdr.flags & HYBRID_BITRATE)) {
+        for (chan = 0; chan < num_chans; ++chan)
+            wps->w.c [chan].error_limit = wp_exp2s (bitrate [chan]);
+
+        return;
+    }
+
+    // from here on we are in HYBRID_BITRATE mode
+    for (chan = 0; chan < num_chans; ++chan)
+        slow_log [chan] = (wps->w.c [chan].slow_level + SLO) >> SLS;
+
+    // HYBRID_BALANCE is only honored for stereo data: both bitrates are then
+    // recomputed from bitrate 0 and a balance term
+    if (num_chans == 2 && (wps->wphdr.flags & HYBRID_BALANCE)) {
+        int balance = (slow_log [1] - slow_log [0] + bitrate [1] + 1) >> 1;
+
+        if (balance > bitrate [0]) {            // everything goes to channel 1
+            bitrate [1] = bitrate [0] * 2;
+            bitrate [0] = 0;
+        }
+        else if (-balance > bitrate [0]) {      // everything goes to channel 0
+            bitrate [0] = bitrate [0] * 2;
+            bitrate [1] = 0;
+        }
+        else {
+            // otherwise "balance" is moved from channel 0 to channel 1
+            bitrate [1] = bitrate [0] + balance;
+            bitrate [0] = bitrate [0] - balance;
+        }
+    }
+
+    // the limit is wp_exp2s (slow_log - bitrate + 0x100), except that it is
+    // forced to zero when slow_log - bitrate is -0x100 or less
+    for (chan = 0; chan < num_chans; ++chan) {
+        int log_limit = slow_log [chan] - bitrate [chan];
+
+        if (log_limit > -0x100)
+            wps->w.c [chan].error_limit = wp_exp2s (log_limit + 0x100);
+        else
+            wps->w.c [chan].error_limit = 0;
+    }
+}
+
+// The concept of a base 2 logarithm is used in many parts of WavPack. It is
+// a way of sufficiently accurately representing 32-bit signed and unsigned
+// values storing only 16 bits (actually fewer). It is also used in the hybrid
+// mode for quickly comparing the relative magnitude of large values (i.e.
+// division) and providing smooth exponentials using only addition.
+
+// These are not strict logarithms in that they become linear around zero and
+// can therefore represent both zero and negative values. They have 8 bits
+// of precision and in "roundtrip" conversions the total error never exceeds 1
+// part in 225 except for the cases of +/-115 and +/-195 (which error by 1).
+
+
+// Return the log2 (bit count in the upper bits, 8-bit table value in the low
+// byte) of a 32-bit unsigned value; the maximum, about 0xff800000, gives 8447.
+
+int FASTCALL wp_log2 (uint32_t avalue)
+{
+    int dbits, mantissa;                // bit count, and the 8-bit index into log2_table
+
+    avalue += avalue >> 9;
+
+    if (avalue < (1 << 8)) {            // fits in one byte: shift the mantissa up
+        dbits = nbits_table [avalue];
+        mantissa = (avalue << (9 - dbits)) & 0xff;
+    }
+    else {                              // otherwise find the top byte and shift down
+        int top = (avalue < (1L << 16)) ? 8 : (avalue < (1L << 24)) ? 16 : 24;
+
+        dbits = nbits_table [avalue >> top] + top;
+        mantissa = (avalue >> (dbits - 9)) & 0xff;
+    }
+
+    return (dbits << 8) + log2_table [mantissa];
+}
+
+// This function scans a buffer of samples and accumulates the total log2 value
+// of their magnitudes (the bitstream storage required for entropy coding is
+// proportional to the base 2 log of the samples). If "limit" is non-zero and a
+// large sample's log2 reaches it, the scan stops and (uint32_t) -1 is returned.
+// On some platforms there is an assembly version of this.
+
+#if !defined(OPT_ASM_X86) && !defined(OPT_ASM_X64)
+
+uint32_t log2buffer (int32_t *samples, uint32_t num_samples, int limit)
+{
+    uint32_t result = 0, avalue;
+    int dbits;
+
+    while (num_samples--) {
+        int32_t sample = *samples++;    // magnitude is formed unsigned: abs (INT32_MIN) is undefined
+        avalue = (sample < 0) ? 0 - (uint32_t) sample : (uint32_t) sample;
+        if ((avalue += avalue >> 9) < (1 << 8)) {
+            dbits = nbits_table [avalue];
+            result += (dbits << 8) + log2_table [(avalue << (9 - dbits)) & 0xff];
+        }
+        else {
+            if (avalue < (1L << 16))
+                dbits = nbits_table [avalue >> 8] + 8;
+            else if (avalue < (1L << 24))
+                dbits = nbits_table [avalue >> 16] + 16;
+            else
+                dbits = nbits_table [avalue >> 24] + 24;
+
+            result += dbits = (dbits << 8) + log2_table [(avalue >> (dbits - 9)) & 0xff];
+
+            if (limit && dbits >= limit)        // the limit is only tested on this (value >= 256) path
+                return (uint32_t) -1;
+        }
+    }
+
+    return result;
+}
+
+#endif
+
+// This function returns the log2 for the specified 32-bit signed value. All
+// input values are valid (the magnitude is negated as unsigned so that even
+// INT32_MIN is well defined) and the return values are in the range +/- 8192.
+
+int wp_log2s (int32_t value)
+{
+    return (value < 0) ? -wp_log2 (0 - (uint32_t) value) : wp_log2 (value);
+}
+
+// This function returns the original integer represented by the supplied
+// logarithm (at least within the provided accuracy). The log is signed, but
+// a full 32-bit value is returned so unsigned conversions work as well (the
+// intended input range is -8192 to +8447; larger logs wrap the shift count).
+
+int32_t wp_exp2s (int log)
+{
+    uint32_t value;
+
+    if (log < 0)
+        return -wp_exp2s (-log);
+
+    value = exp2_table [log & 0xff] | 0x100;    // 9-bit mantissa with the leading one restored
+    // logs decoded from file data can exceed the intended range, so keep the shift count below 32
+    if ((log >>= 8) <= 9)
+        return value >> (9 - log);
+    else
+        return value << ((log - 9) & 0x1f);
+}
+
+// These two functions convert internal weights (which are normally +/-1024)
+// to and from an 8-bit signed character version for storage in metadata.
+// store_weight() clips its argument to that range, slightly reduces positive
+// values, and then adds 4 before dropping the low 3 bits.
+
+signed char store_weight (int weight)
+{
+    int clipped = (weight > 1024) ? 1024 : (weight < -1024) ? -1024 : weight;
+
+    // positive values are reduced first; restore_weight() adds the amount back
+    if (clipped > 0)
+        clipped -= (clipped + 64) >> 7;
+
+    return (clipped + 4) >> 3;
+}
+
+
+int restore_weight (signed char weight)     // expand a stored 8-bit weight back to the internal scale
+{
+    int result = (int) weight * 8;          // "* 8", not "<< 3": left-shifting a negative value is undefined
+
+    if (result > 0)
+        result += (result + 64) >> 7;       // small positive correction, so that a stored 127 yields 1024
+
+    return result;
+}
diff --git a/third_party/wavpack/src/extra1.c b/third_party/wavpack/src/extra1.c
index 4936fb6..80a6362 100644
--- a/third_party/wavpack/src/extra1.c
+++ b/third_party/wavpack/src/extra1.c
@@ -1,7 +1,7 @@
 ////////////////////////////////////////////////////////////////////////////
 //                           **** WAVPACK ****                            //
 //                  Hybrid Lossless Wavefile Compressor                   //
-//              Copyright (c) 1998 - 2006 Conifer Software.               //
+//              Copyright (c) 1998 - 2013 Conifer Software.               //
 //                          All Rights Reserved.                          //
 //      Distributed under the BSD Software License (see license.txt)      //
 ////////////////////////////////////////////////////////////////////////////
@@ -10,28 +10,41 @@
 
 // This module handles the "extra" mode for mono files.
 
-#include "wavpack_local.h"
-
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
 #include <math.h>
 
-//#define USE_OVERHEAD
-#define LOG_LIMIT 6912
-//#define EXTRA_DUMP
+#include "wavpack_local.h"
 
-#ifdef DEBUG_ALLOC
-#define malloc malloc_db
-#define realloc realloc_db
-#define free free_db
-void *malloc_db (uint32_t size);
-void *realloc_db (void *ptr, uint32_t size);
-void free_db (void *ptr);
-int32_t dump_alloc (void);
+// This flag causes this module to take into account the size of the header
+// (which grows with more decorrelation passes) when making decisions about
+// adding additional passes (as opposed to just considering the resulting
+// magnitude of the residuals). With really long blocks it seems to actually
+// hurt compression (for reasons I cannot explain), but with short blocks it
+// works okay, so we're enabling it for now.
+
+#define USE_OVERHEAD
+
+// If the log2 value of any sample in a buffer being scanned exceeds this value,
+// we abandon that configuration. This prevents us from going down paths that
+// are wildly unstable.
+
+#define LOG_LIMIT 6912
+
+//#define EXTRA_DUMP        // dump generated filter data to error_line()
+
+#ifdef OPT_ASM_X86
+    #define PACK_DECORR_MONO_PASS_CONT pack_decorr_mono_pass_cont_x86
+#elif defined(OPT_ASM_X64) && (defined (_WIN64) || defined(__CYGWIN__) || defined(__MINGW64__))
+    #define PACK_DECORR_MONO_PASS_CONT pack_decorr_mono_pass_cont_x64win
+#elif defined(OPT_ASM_X64)
+    #define PACK_DECORR_MONO_PASS_CONT pack_decorr_mono_pass_cont_x64
 #endif
 
-//////////////////////////////// local tables ///////////////////////////////
+#ifdef PACK_DECORR_MONO_PASS_CONT
+    void PACK_DECORR_MONO_PASS_CONT (int32_t *out_buffer, int32_t *in_buffer,  struct decorr_pass *dpp, int32_t sample_count);
+#endif
 
 typedef struct {
     int32_t *sampleptrs [MAX_NTERMS+2];
@@ -42,13 +55,22 @@ typedef struct {
 
 static void decorr_mono_pass (int32_t *in_samples, int32_t *out_samples, uint32_t num_samples, struct decorr_pass *dpp, int dir)
 {
+    int32_t cont_samples = 0;
     int m = 0, i;
 
+#ifdef PACK_DECORR_MONO_PASS_CONT
+    if (num_samples > 16 && dir > 0) {
+        int32_t pre_samples = (dpp->term > MAX_TERM) ? 2 : dpp->term;
+        cont_samples = num_samples - pre_samples;
+        num_samples = pre_samples;
+    }
+#endif
+
     dpp->sum_A = 0;
 
     if (dir < 0) {
-        out_samples += (num_samples - 1);
-        in_samples += (num_samples - 1);
+        out_samples += (num_samples + cont_samples - 1);
+        in_samples += (num_samples + cont_samples - 1);
         dir = -1;
     }
     else
@@ -57,7 +79,7 @@ static void decorr_mono_pass (int32_t *in_samples, int32_t *out_samples, uint32_
     dpp->weight_A = restore_weight (store_weight (dpp->weight_A));
 
     for (i = 0; i < 8; ++i)
-        dpp->samples_A [i] = exp2s (log2s (dpp->samples_A [i]));
+        dpp->samples_A [i] = wp_exp2s (wp_log2s (dpp->samples_A [i]));
 
     if (dpp->term > MAX_TERM) {
         while (num_samples--) {
@@ -108,6 +130,11 @@ static void decorr_mono_pass (int32_t *in_samples, int32_t *out_samples, uint32_
             m = (m + 1) & (MAX_TERM - 1);
         }
     }
+
+#ifdef PACK_DECORR_MONO_PASS_CONT
+    if (cont_samples)
+        PACK_DECORR_MONO_PASS_CONT (out_samples, in_samples, dpp, cont_samples);
+#endif
 }
 
 static void reverse_mono_decorr (struct decorr_pass *dpp)
@@ -224,7 +251,7 @@ static void recurse_mono (WavpackContext *wpc, WavpackExtraInfo *info, int depth
         info->dps [depth].term = term;
         info->dps [depth].delta = delta;
         decorr_mono_buffer (samples, outsamples, wps->wphdr.block_samples, info->dps, depth);
-        bits = log2buffer (outsamples, wps->wphdr.block_samples, info->log_limit);
+        bits = LOG2BUFFER (outsamples, wps->wphdr.block_samples, info->log_limit);
 
         if (bits != (uint32_t) -1)
             bits += log2overhead (info->dps [0].term, depth + 1);
@@ -289,7 +316,7 @@ static void delta_mono (WavpackContext *wpc, WavpackExtraInfo *info)
             decorr_mono_buffer (info->sampleptrs [i], info->sampleptrs [i+1], wps->wphdr.block_samples, info->dps, i);
         }
 
-        bits = log2buffer (info->sampleptrs [i], wps->wphdr.block_samples, info->log_limit);
+        bits = LOG2BUFFER (info->sampleptrs [i], wps->wphdr.block_samples, info->log_limit);
 
         if (bits != (uint32_t) -1)
             bits += log2overhead (wps->decorr_passes [0].term, i);
@@ -314,7 +341,7 @@ static void delta_mono (WavpackContext *wpc, WavpackExtraInfo *info)
             decorr_mono_buffer (info->sampleptrs [i], info->sampleptrs [i+1], wps->wphdr.block_samples, info->dps, i);
         }
 
-        bits = log2buffer (info->sampleptrs [i], wps->wphdr.block_samples, info->log_limit);
+        bits = LOG2BUFFER (info->sampleptrs [i], wps->wphdr.block_samples, info->log_limit);
 
         if (bits != (uint32_t) -1)
             bits += log2overhead (wps->decorr_passes [0].term, i);
@@ -358,7 +385,7 @@ static void sort_mono (WavpackContext *wpc, WavpackExtraInfo *info)
             for (i = ri; i < info->nterms && wps->decorr_passes [i].term; ++i)
                 decorr_mono_buffer (info->sampleptrs [i], info->sampleptrs [i+1], wps->wphdr.block_samples, info->dps, i);
 
-            bits = log2buffer (info->sampleptrs [i], wps->wphdr.block_samples, info->log_limit);
+            bits = LOG2BUFFER (info->sampleptrs [i], wps->wphdr.block_samples, info->log_limit);
 
             if (bits != (uint32_t) -1)
                 bits += log2overhead (wps->decorr_passes [0].term, i);
@@ -412,13 +439,13 @@ static void analyze_mono (WavpackContext *wpc, int32_t *samples, int do_samples)
     for (i = 0; i < info.nterms && info.dps [i].term; ++i)
         decorr_mono_pass (info.sampleptrs [i], info.sampleptrs [i + 1], wps->wphdr.block_samples, info.dps + i, 1);
 
-    info.best_bits = log2buffer (info.sampleptrs [info.nterms], wps->wphdr.block_samples, 0) * 1;
+    info.best_bits = LOG2BUFFER (info.sampleptrs [info.nterms], wps->wphdr.block_samples, 0) * 1;
     info.best_bits += log2overhead (info.dps [0].term, i);
     memcpy (info.sampleptrs [info.nterms + 1], info.sampleptrs [i], wps->wphdr.block_samples * 4);
 
     if (wpc->config.extra_flags & EXTRA_BRANCHES)
         recurse_mono (wpc, &info, 0, (int) floor (wps->delta_decay + 0.5),
-            log2buffer (info.sampleptrs [0], wps->wphdr.block_samples, 0));
+            LOG2BUFFER (info.sampleptrs [0], wps->wphdr.block_samples, 0));
 
     if (wpc->config.extra_flags & EXTRA_SORT_FIRST)
         sort_mono (wpc, &info);
@@ -500,6 +527,12 @@ void execute_mono (WavpackContext *wpc, int32_t *samples, int no_history, int do
     uint32_t best_size = (uint32_t) -1, size;
     int log_limit, pi, i;
 
+#ifdef SKIP_DECORRELATION
+    CLEAR (wps->decorr_passes);
+    wps->num_terms = 0;
+    return;
+#endif
+
     for (i = 0; i < num_samples; ++i)
         if (samples [i])
             break;
@@ -571,7 +604,7 @@ void execute_mono (WavpackContext *wpc, int32_t *samples, int no_history, int do
         }
 
         wpds = &wps->decorr_specs [c];
-        nterms = (int) strlen (wpds->terms);
+        nterms = (int) strlen ((char *) wpds->terms);
 
         while (1) {
         memcpy (temp_buffer [0], noisy_buffer ? noisy_buffer : samples, buf_size);
@@ -598,7 +631,7 @@ void execute_mono (WavpackContext *wpc, int32_t *samples, int no_history, int do
             decorr_mono_pass (temp_buffer [j&1], temp_buffer [~j&1], num_samples, &temp_decorr_pass, 1);
         }
 
-        size = log2buffer (temp_buffer [j&1], num_samples, log_limit);
+        size = LOG2BUFFER (temp_buffer [j&1], num_samples, log_limit);
 
         if (size == (uint32_t) -1 && nterms)
             nterms >>= 1;
diff --git a/third_party/wavpack/src/extra2.c b/third_party/wavpack/src/extra2.c
index 17fcf01..8a1a305 100644
--- a/third_party/wavpack/src/extra2.c
+++ b/third_party/wavpack/src/extra2.c
@@ -1,7 +1,7 @@
 ////////////////////////////////////////////////////////////////////////////
 //                           **** WAVPACK ****                            //
 //                  Hybrid Lossless Wavefile Compressor                   //
-//              Copyright (c) 1998 - 2006 Conifer Software.               //
+//              Copyright (c) 1998 - 2013 Conifer Software.               //
 //               MMX optimizations (c) 2006 Joachim Henke                 //
 //                          All Rights Reserved.                          //
 //      Distributed under the BSD Software License (see license.txt)      //
@@ -11,325 +11,74 @@
 
 // This module handles the "extra" mode for stereo files.
 
-#include "wavpack_local.h"
-
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
 #include <math.h>
 
-//#define USE_OVERHEAD
-#define LOG_LIMIT 6912
-//#define EXTRA_DUMP
+#include "wavpack_local.h"
 
-#ifdef DEBUG_ALLOC
-#define malloc malloc_db
-#define realloc realloc_db
-#define free free_db
-void *malloc_db (uint32_t size);
-void *realloc_db (void *ptr, uint32_t size);
-void free_db (void *ptr);
-int32_t dump_alloc (void);
+// This flag causes this module to take into account the size of the header
+// (which grows with more decorrelation passes) when making decisions about
+// adding additional passes (as opposed to just considering the resulting
+// magnitude of the residuals). With really long blocks it seems to actually
+// hurt compression (for reasons I cannot explain), but with short blocks it
+// works okay, so we're enabling it for now.
+
+#define USE_OVERHEAD
+
+// If the log2 value of any sample in a buffer being scanned exceeds this value,
+// we abandon that configuration. This prevents us from going down paths that
+// are wildly unstable.
+
+#define LOG_LIMIT 6912
+
+//#define EXTRA_DUMP        // dump generated filter data to error_line()
+
+#ifdef OPT_ASM_X86
+    #define PACK_DECORR_STEREO_PASS_CONT pack_decorr_stereo_pass_cont_x86
+    #define PACK_DECORR_STEREO_PASS_CONT_REV pack_decorr_stereo_pass_cont_rev_x86
+    #define PACK_DECORR_STEREO_PASS_CONT_AVAILABLE pack_cpu_has_feature_x86(CPU_FEATURE_MMX)
+#elif defined(OPT_ASM_X64) && (defined (_WIN64) || defined(__CYGWIN__) || defined(__MINGW64__))
+    #define PACK_DECORR_STEREO_PASS_CONT pack_decorr_stereo_pass_cont_x64win
+    #define PACK_DECORR_STEREO_PASS_CONT_REV pack_decorr_stereo_pass_cont_rev_x64win
+    #define PACK_DECORR_STEREO_PASS_CONT_AVAILABLE 1
+#elif defined(OPT_ASM_X64)
+    #define PACK_DECORR_STEREO_PASS_CONT pack_decorr_stereo_pass_cont_x64
+    #define PACK_DECORR_STEREO_PASS_CONT_REV pack_decorr_stereo_pass_cont_rev_x64
+    #define PACK_DECORR_STEREO_PASS_CONT_AVAILABLE 1
 #endif
 
-//////////////////////////////// local tables ///////////////////////////////
+#ifdef PACK_DECORR_STEREO_PASS_CONT
+    void PACK_DECORR_STEREO_PASS_CONT (struct decorr_pass *dpp, int32_t *in_buffer, int32_t *out_buffer, int32_t sample_count);
+    void PACK_DECORR_STEREO_PASS_CONT_REV (struct decorr_pass *dpp, int32_t *in_buffer, int32_t *out_buffer, int32_t sample_count);
+#endif
 
 typedef struct {
     int32_t *sampleptrs [MAX_NTERMS+2];
     struct decorr_pass dps [MAX_NTERMS];
-    int nterms, log_limit, gt16bit;
+    int nterms, log_limit;
     uint32_t best_bits;
 } WavpackExtraInfo;
 
-#ifdef OPT_MMX
-
-static void decorr_stereo_pass (int32_t *in_samples, int32_t *out_samples, int32_t num_samples, struct decorr_pass *dpp, int dir)
-{
-    const __m64
-        delta = _mm_set1_pi32 (dpp->delta),
-        fill = _mm_set1_pi32 (0x7bff),
-        mask = _mm_set1_pi32 (0x7fff),
-        round = _mm_set1_pi32 (512),
-        zero = _mm_set1_pi32 (0);
-    __m64
-        sum_AB = zero,
-        weight_AB = _mm_set_pi32 (restore_weight (store_weight (dpp->weight_B)), restore_weight (store_weight (dpp->weight_A))),
-        left_right, sam_AB, tmp0, tmp1, samples_AB [MAX_TERM];
-    int k, m = 0;
-
-    if (dir < 0) {
-        out_samples += (num_samples - 1) * 2;
-        in_samples += (num_samples - 1) * 2;
-        dir = -2;
-    }
-    else
-        dir = 2;
-
-    for (k = 0; k < MAX_TERM; ++k) {
-        ((int32_t *) samples_AB) [k * 2] = exp2s (log2s (dpp->samples_A [k]));
-        ((int32_t *) samples_AB) [k * 2 + 1] = exp2s (log2s (dpp->samples_B [k]));
-    }
-
-    if (dpp->term > 0) {
-        if (dpp->term == 17) {
-            while (num_samples--) {
-                left_right = *(__m64 *) in_samples;
-                tmp0 = samples_AB [0];
-                sam_AB = _m_paddd (tmp0, tmp0);
-                sam_AB = _m_psubd (sam_AB, samples_AB [1]);
-                samples_AB [0] = left_right;
-                samples_AB [1] = tmp0;
-
-                tmp0 = _m_paddd (sam_AB, sam_AB);
-                tmp1 = _m_pand (sam_AB, mask);
-                tmp0 = _m_psrldi (tmp0, 16);
-                tmp1 = _m_pmaddwd (tmp1, weight_AB);
-                tmp0 = _m_pmaddwd (tmp0, weight_AB);
-                tmp1 = _m_paddd (tmp1, round);
-                tmp0 = _m_pslldi (tmp0, 5);
-                tmp1 = _m_psradi (tmp1, 10);
-                left_right = _m_psubd (left_right, tmp0);
-                left_right = _m_psubd (left_right, tmp1);
-
-                *(__m64 *) out_samples = left_right;
-
-                tmp0 = _m_pxor (sam_AB, left_right);
-                tmp0 = _m_psradi (tmp0, 31);
-                tmp1 = _m_pxor (delta, tmp0);
-                tmp1 = _m_psubd (tmp1, tmp0);
-                sam_AB = _m_pcmpeqd (sam_AB, zero);
-                tmp0 = _m_pcmpeqd (left_right, zero);
-                tmp0 = _m_por (tmp0, sam_AB);
-                tmp0 = _m_pandn (tmp0, tmp1);
-                weight_AB = _m_paddd (weight_AB, tmp0);
-
-                sum_AB = _m_paddd (sum_AB, weight_AB);
-
-                in_samples += dir;
-                out_samples += dir;
-            }
-        }
-        else if (dpp->term == 18) {
-            while (num_samples--) {
-                left_right = *(__m64 *) in_samples;
-                tmp0 = samples_AB [0];
-                sam_AB = _m_psubd (tmp0, samples_AB [1]);
-                sam_AB = _m_psradi (sam_AB, 1);
-                sam_AB = _m_paddd (sam_AB, tmp0);
-                samples_AB [0] = left_right;
-                samples_AB [1] = tmp0;
-
-                tmp0 = _m_paddd (sam_AB, sam_AB);
-                tmp1 = _m_pand (sam_AB, mask);
-                tmp0 = _m_psrldi (tmp0, 16);
-                tmp1 = _m_pmaddwd (tmp1, weight_AB);
-                tmp0 = _m_pmaddwd (tmp0, weight_AB);
-                tmp1 = _m_paddd (tmp1, round);
-                tmp0 = _m_pslldi (tmp0, 5);
-                tmp1 = _m_psradi (tmp1, 10);
-                left_right = _m_psubd (left_right, tmp0);
-                left_right = _m_psubd (left_right, tmp1);
-
-                *(__m64 *) out_samples = left_right;
-
-                tmp0 = _m_pxor (sam_AB, left_right);
-                tmp0 = _m_psradi (tmp0, 31);
-                tmp1 = _m_pxor (delta, tmp0);
-                tmp1 = _m_psubd (tmp1, tmp0);
-                sam_AB = _m_pcmpeqd (sam_AB, zero);
-                tmp0 = _m_pcmpeqd (left_right, zero);
-                tmp0 = _m_por (tmp0, sam_AB);
-                tmp0 = _m_pandn (tmp0, tmp1);
-                weight_AB = _m_paddd (weight_AB, tmp0);
-
-                sum_AB = _m_paddd (sum_AB, weight_AB);
-
-                in_samples += dir;
-                out_samples += dir;
-            }
-        }
-        else {
-            k = dpp->term & (MAX_TERM - 1);
-            while (num_samples--) {
-                left_right = *(__m64 *) in_samples;
-                sam_AB = samples_AB [m];
-                samples_AB [k] = left_right;
-
-                tmp0 = _m_paddd (sam_AB, sam_AB);
-                tmp1 = _m_pand (sam_AB, mask);
-                tmp0 = _m_psrldi (tmp0, 16);
-                tmp1 = _m_pmaddwd (tmp1, weight_AB);
-                tmp0 = _m_pmaddwd (tmp0, weight_AB);
-                tmp1 = _m_paddd (tmp1, round);
-                tmp0 = _m_pslldi (tmp0, 5);
-                tmp1 = _m_psradi (tmp1, 10);
-                left_right = _m_psubd (left_right, tmp0);
-                left_right = _m_psubd (left_right, tmp1);
-
-                *(__m64 *) out_samples = left_right;
-
-                tmp0 = _m_pxor (sam_AB, left_right);
-                tmp0 = _m_psradi (tmp0, 31);
-                tmp1 = _m_pxor (delta, tmp0);
-                tmp1 = _m_psubd (tmp1, tmp0);
-                sam_AB = _m_pcmpeqd (sam_AB, zero);
-                tmp0 = _m_pcmpeqd (left_right, zero);
-                tmp0 = _m_por (tmp0, sam_AB);
-                tmp0 = _m_pandn (tmp0, tmp1);
-                weight_AB = _m_paddd (weight_AB, tmp0);
-
-                sum_AB = _m_paddd (sum_AB, weight_AB);
-
-                in_samples += dir;
-                out_samples += dir;
-                k = (k + 1) & (MAX_TERM - 1);
-                m = (m + 1) & (MAX_TERM - 1);
-            }
-        }
-    }
-    else {
-        if (dpp->term == -1) {
-            while (num_samples--) {
-                left_right = *(__m64 *) in_samples;
-                sam_AB = samples_AB [0];
-                samples_AB [0] = _m_punpckhdq (left_right, sam_AB);
-                sam_AB = _m_punpckldq (sam_AB, left_right);
-
-                tmp0 = _m_paddd (sam_AB, sam_AB);
-                tmp1 = _m_pand (sam_AB, mask);
-                tmp0 = _m_psrldi (tmp0, 16);
-                tmp1 = _m_pmaddwd (tmp1, weight_AB);
-                tmp0 = _m_pmaddwd (tmp0, weight_AB);
-                tmp1 = _m_paddd (tmp1, round);
-                tmp0 = _m_pslldi (tmp0, 5);
-                tmp1 = _m_psradi (tmp1, 10);
-                left_right = _m_psubd (left_right, tmp0);
-                left_right = _m_psubd (left_right, tmp1);
-
-                *(__m64 *) out_samples = left_right;
-
-                tmp0 = _m_pcmpeqd (sam_AB, zero);
-                tmp1 = _m_pcmpeqd (left_right, zero);
-                tmp0 = _m_por (tmp0, tmp1);
-                tmp0 = _m_pandn (tmp0, delta);
-                sam_AB = _m_pxor (sam_AB, left_right);
-                sam_AB = _m_psradi (sam_AB, 31);
-                tmp1 = _m_psubd (fill, sam_AB);
-                weight_AB = _m_pxor (weight_AB, sam_AB);
-                weight_AB = _m_paddd (weight_AB, tmp1);
-                weight_AB = _m_paddsw (weight_AB, tmp0);
-                weight_AB = _m_psubd (weight_AB, tmp1);
-                weight_AB = _m_pxor (weight_AB, sam_AB);
-
-                sum_AB = _m_paddd (sum_AB, weight_AB);
-
-                in_samples += dir;
-                out_samples += dir;
-            }
-        }
-        else if (dpp->term == -2) {
-            while (num_samples--) {
-                left_right = *(__m64 *) in_samples;
-                sam_AB = samples_AB [0];
-                samples_AB [0] = _m_punpckldq (sam_AB, left_right);
-                sam_AB = _m_punpckhdq (left_right, sam_AB);
-
-                tmp0 = _m_paddd (sam_AB, sam_AB);
-                tmp1 = _m_pand (sam_AB, mask);
-                tmp0 = _m_psrldi (tmp0, 16);
-                tmp1 = _m_pmaddwd (tmp1, weight_AB);
-                tmp0 = _m_pmaddwd (tmp0, weight_AB);
-                tmp1 = _m_paddd (tmp1, round);
-                tmp0 = _m_pslldi (tmp0, 5);
-                tmp1 = _m_psradi (tmp1, 10);
-                left_right = _m_psubd (left_right, tmp0);
-                left_right = _m_psubd (left_right, tmp1);
-
-                *(__m64 *) out_samples = left_right;
-
-                tmp0 = _m_pcmpeqd (sam_AB, zero);
-                tmp1 = _m_pcmpeqd (left_right, zero);
-                tmp0 = _m_por (tmp0, tmp1);
-                tmp0 = _m_pandn (tmp0, delta);
-                sam_AB = _m_pxor (sam_AB, left_right);
-                sam_AB = _m_psradi (sam_AB, 31);
-                tmp1 = _m_psubd (fill, sam_AB);
-                weight_AB = _m_pxor (weight_AB, sam_AB);
-                weight_AB = _m_paddd (weight_AB, tmp1);
-                weight_AB = _m_paddsw (weight_AB, tmp0);
-                weight_AB = _m_psubd (weight_AB, tmp1);
-                weight_AB = _m_pxor (weight_AB, sam_AB);
-
-                sum_AB = _m_paddd (sum_AB, weight_AB);
-
-                in_samples += dir;
-                out_samples += dir;
-            }
-        }
-        else if (dpp->term == -3) {
-            while (num_samples--) {
-                left_right = *(__m64 *) in_samples;
-                sam_AB = samples_AB [0];
-                tmp0 = _m_punpckhdq (left_right, left_right);
-                samples_AB [0] = _m_punpckldq (tmp0, left_right);
-
-                tmp0 = _m_paddd (sam_AB, sam_AB);
-                tmp1 = _m_pand (sam_AB, mask);
-                tmp0 = _m_psrldi (tmp0, 16);
-                tmp1 = _m_pmaddwd (tmp1, weight_AB);
-                tmp0 = _m_pmaddwd (tmp0, weight_AB);
-                tmp1 = _m_paddd (tmp1, round);
-                tmp0 = _m_pslldi (tmp0, 5);
-                tmp1 = _m_psradi (tmp1, 10);
-                left_right = _m_psubd (left_right, tmp0);
-                left_right = _m_psubd (left_right, tmp1);
-
-                *(__m64 *) out_samples = left_right;
-
-                tmp0 = _m_pcmpeqd (sam_AB, zero);
-                tmp1 = _m_pcmpeqd (left_right, zero);
-                tmp0 = _m_por (tmp0, tmp1);
-                tmp0 = _m_pandn (tmp0, delta);
-                sam_AB = _m_pxor (sam_AB, left_right);
-                sam_AB = _m_psradi (sam_AB, 31);
-                tmp1 = _m_psubd (fill, sam_AB);
-                weight_AB = _m_pxor (weight_AB, sam_AB);
-                weight_AB = _m_paddd (weight_AB, tmp1);
-                weight_AB = _m_paddsw (weight_AB, tmp0);
-                weight_AB = _m_psubd (weight_AB, tmp1);
-                weight_AB = _m_pxor (weight_AB, sam_AB);
-
-                sum_AB = _m_paddd (sum_AB, weight_AB);
-
-                in_samples += dir;
-                out_samples += dir;
-            }
-        }
-    }
-    dpp->sum_A = ((int32_t *) &sum_AB) [0];
-    dpp->sum_B = ((int32_t *) &sum_AB) [1];
-    dpp->weight_A = ((int32_t *) &weight_AB) [0];
-    dpp->weight_B = ((int32_t *) &weight_AB) [1];
-
-    for (k = 0; k < MAX_TERM; ++k) {
-        dpp->samples_A [k] = ((int32_t *) samples_AB) [m * 2];
-        dpp->samples_B [k] = ((int32_t *) samples_AB) [m * 2 + 1];
-        m = (m + 1) & (MAX_TERM - 1);
-    }
-    _mm_empty ();
-}
-
-#else
-
 static void decorr_stereo_pass (int32_t *in_samples, int32_t *out_samples, int32_t num_samples, struct decorr_pass *dpp, int dir)
 {
+    int32_t cont_samples = 0;
     int m = 0, i;
 
+#ifdef PACK_DECORR_STEREO_PASS_CONT
+    if (num_samples > 16 && PACK_DECORR_STEREO_PASS_CONT_AVAILABLE) {
+        int32_t pre_samples = (dpp->term < 0 || dpp->term > MAX_TERM) ? 2 : dpp->term;
+        cont_samples = num_samples - pre_samples;
+        num_samples = pre_samples;
+    }
+#endif
+
     dpp->sum_A = dpp->sum_B = 0;
 
     if (dir < 0) {
-        out_samples += (num_samples - 1) * 2;
-        in_samples += (num_samples - 1) * 2;
+        out_samples += (num_samples + cont_samples - 1) * 2;
+        in_samples += (num_samples + cont_samples - 1) * 2;
         dir = -2;
     }
     else
@@ -339,8 +88,8 @@ static void decorr_stereo_pass (int32_t *in_samples, int32_t *out_samples, int32
     dpp->weight_B = restore_weight (store_weight (dpp->weight_B));
 
     for (i = 0; i < 8; ++i) {
-        dpp->samples_A [i] = exp2s (log2s (dpp->samples_A [i]));
-        dpp->samples_B [i] = exp2s (log2s (dpp->samples_B [i]));
+        dpp->samples_A [i] = wp_exp2s (wp_log2s (dpp->samples_A [i]));
+        dpp->samples_B [i] = wp_exp2s (wp_log2s (dpp->samples_B [i]));
     }
 
     switch (dpp->term) {
@@ -511,184 +260,15 @@ static void decorr_stereo_pass (int32_t *in_samples, int32_t *out_samples, int32
 
             break;
     }
-}
 
+#ifdef PACK_DECORR_STEREO_PASS_CONT
+    if (cont_samples) {
+        if (dir < 0)
+            PACK_DECORR_STEREO_PASS_CONT_REV (dpp, in_samples, out_samples, cont_samples);
+        else
+            PACK_DECORR_STEREO_PASS_CONT (dpp, in_samples, out_samples, cont_samples);
+    }
 #endif
-
-static void decorr_stereo_pass_quick (int32_t *in_samples, int32_t *out_samples, int32_t num_samples, struct decorr_pass *dpp, int dir)
-{
-    int m = 0, i;
-
-    if (dir < 0) {
-        out_samples += (num_samples - 1) * 2;
-        in_samples += (num_samples - 1) * 2;
-        dir = -2;
-    }
-    else
-        dir = 2;
-
-    dpp->weight_A = restore_weight (store_weight (dpp->weight_A));
-    dpp->weight_B = restore_weight (store_weight (dpp->weight_B));
-
-    for (i = 0; i < 8; ++i) {
-        dpp->samples_A [i] = exp2s (log2s (dpp->samples_A [i]));
-        dpp->samples_B [i] = exp2s (log2s (dpp->samples_B [i]));
-    }
-
-    switch (dpp->term) {
-
-        case 2:
-            while (num_samples--) {
-                int32_t sam, tmp;
-
-                sam = dpp->samples_A [0];
-                dpp->samples_A [0] = dpp->samples_A [1];
-                out_samples [0] = tmp = (dpp->samples_A [1] = in_samples [0]) - apply_weight_i (dpp->weight_A, sam);
-                update_weight (dpp->weight_A, dpp->delta, sam, tmp);
-
-                sam = dpp->samples_B [0];
-                dpp->samples_B [0] = dpp->samples_B [1];
-                out_samples [1] = tmp = (dpp->samples_B [1] = in_samples [1]) - apply_weight_i (dpp->weight_B, sam);
-                update_weight (dpp->weight_B, dpp->delta, sam, tmp);
-
-                in_samples += dir;
-                out_samples += dir;
-            }
-
-            break;
-
-        case 17:
-            while (num_samples--) {
-                int32_t sam, tmp;
-
-                sam = 2 * dpp->samples_A [0] - dpp->samples_A [1];
-                dpp->samples_A [1] = dpp->samples_A [0];
-                out_samples [0] = tmp = (dpp->samples_A [0] = in_samples [0]) - apply_weight_i (dpp->weight_A, sam);
-                update_weight (dpp->weight_A, dpp->delta, sam, tmp);
-
-                sam = 2 * dpp->samples_B [0] - dpp->samples_B [1];
-                dpp->samples_B [1] = dpp->samples_B [0];
-                out_samples [1] = tmp = (dpp->samples_B [0] = in_samples [1]) - apply_weight_i (dpp->weight_B, sam);
-                update_weight (dpp->weight_B, dpp->delta, sam, tmp);
-
-                in_samples += dir;
-                out_samples += dir;
-            }
-
-            break;
-
-        case 18:
-            while (num_samples--) {
-                int32_t sam, tmp;
-
-                sam = dpp->samples_A [0] + ((dpp->samples_A [0] - dpp->samples_A [1]) >> 1);
-                dpp->samples_A [1] = dpp->samples_A [0];
-                out_samples [0] = tmp = (dpp->samples_A [0] = in_samples [0]) - apply_weight_i (dpp->weight_A, sam);
-                update_weight (dpp->weight_A, dpp->delta, sam, tmp);
-
-                sam = dpp->samples_B [0] + ((dpp->samples_B [0] - dpp->samples_B [1]) >> 1);
-                dpp->samples_B [1] = dpp->samples_B [0];
-                out_samples [1] = tmp = (dpp->samples_B [0] = in_samples [1]) - apply_weight_i (dpp->weight_B, sam);
-                update_weight (dpp->weight_B, dpp->delta, sam, tmp);
-
-                in_samples += dir;
-                out_samples += dir;
-            }
-
-            break;
-
-        default: {
-            int k = dpp->term & (MAX_TERM - 1);
-
-            while (num_samples--) {
-                int32_t sam, tmp;
-
-                sam = dpp->samples_A [m];
-                out_samples [0] = tmp = (dpp->samples_A [k] = in_samples [0]) - apply_weight_i (dpp->weight_A, sam);
-                update_weight (dpp->weight_A, dpp->delta, sam, tmp);
-
-                sam = dpp->samples_B [m];
-                out_samples [1] = tmp = (dpp->samples_B [k] = in_samples [1]) - apply_weight_i (dpp->weight_B, sam);
-                update_weight (dpp->weight_B, dpp->delta, sam, tmp);
-
-                in_samples += dir;
-                out_samples += dir;
-                m = (m + 1) & (MAX_TERM - 1);
-                k = (k + 1) & (MAX_TERM - 1);
-            }
-
-            if (m) {
-                int32_t temp_A [MAX_TERM], temp_B [MAX_TERM];
-                int k;
-
-                memcpy (temp_A, dpp->samples_A, sizeof (dpp->samples_A));
-                memcpy (temp_B, dpp->samples_B, sizeof (dpp->samples_B));
-
-                for (k = 0; k < MAX_TERM; k++) {
-                    dpp->samples_A [k] = temp_A [m];
-                    dpp->samples_B [k] = temp_B [m];
-                    m = (m + 1) & (MAX_TERM - 1);
-                }
-            }
-
-            break;
-        }
-
-        case -1:
-            while (num_samples--) {
-                int32_t sam_A, sam_B, tmp;
-
-                sam_A = dpp->samples_A [0];
-                out_samples [0] = tmp = (sam_B = in_samples [0]) - apply_weight_i (dpp->weight_A, sam_A);
-                update_weight_clip (dpp->weight_A, dpp->delta, sam_A, tmp);
-
-                out_samples [1] = tmp = (dpp->samples_A [0] = in_samples [1]) - apply_weight_i (dpp->weight_B, sam_B);
-                update_weight_clip (dpp->weight_B, dpp->delta, sam_B, tmp);
-
-                in_samples += dir;
-                out_samples += dir;
-            }
-
-            break;
-
-        case -2:
-            while (num_samples--) {
-                int32_t sam_A, sam_B, tmp;
-
-                sam_B = dpp->samples_B [0];
-                out_samples [1] = tmp = (sam_A = in_samples [1]) - apply_weight_i (dpp->weight_B, sam_B);
-                update_weight_clip (dpp->weight_B, dpp->delta, sam_B, tmp);
-
-                out_samples [0] = tmp = (dpp->samples_B [0] = in_samples [0]) - apply_weight_i (dpp->weight_A, sam_A);
-                update_weight_clip (dpp->weight_A, dpp->delta, sam_A, tmp);
-
-                in_samples += dir;
-                out_samples += dir;
-            }
-
-            break;
-
-        case -3:
-            while (num_samples--) {
-                int32_t sam_A, sam_B, tmp;
-
-                sam_A = dpp->samples_A [0];
-                sam_B = dpp->samples_B [0];
-
-                dpp->samples_A [0] = tmp = in_samples [1];
-                out_samples [1] = tmp -= apply_weight_i (dpp->weight_B, sam_B);
-                update_weight_clip (dpp->weight_B, dpp->delta, sam_B, tmp);
-
-                dpp->samples_B [0] = tmp = in_samples [0];
-                out_samples [0] = tmp -= apply_weight_i (dpp->weight_A, sam_A);
-                update_weight_clip (dpp->weight_A, dpp->delta, sam_A, tmp);
-
-                in_samples += dir;
-                out_samples += dir;
-            }
-
-            break;
-    }
 }
 
 static void reverse_decorr (struct decorr_pass *dpp)
@@ -788,10 +368,7 @@ static void decorr_stereo_buffer (WavpackExtraInfo *info, int32_t *samples, int3
 //    if (memcmp (dppi, &dp, sizeof (dp)))
 //      error_line ("decorr_passes don't match, delta = %d", delta);
 
-    if (info->gt16bit)
-        decorr_stereo_pass (samples, outsamples, num_samples, &dp, 1);
-    else
-        decorr_stereo_pass_quick (samples, outsamples, num_samples, &dp, 1);
+    decorr_stereo_pass (samples, outsamples, num_samples, &dp, 1);
 }
 
 static int log2overhead (int first_term, int num_terms)
@@ -837,7 +414,7 @@ static void recurse_stereo (WavpackContext *wpc, WavpackExtraInfo *info, int dep
         info->dps [depth].term = term;
         info->dps [depth].delta = delta;
         decorr_stereo_buffer (info, samples, outsamples, wps->wphdr.block_samples, depth);
-        bits = log2buffer (outsamples, wps->wphdr.block_samples * 2, info->log_limit);
+        bits = LOG2BUFFER (outsamples, wps->wphdr.block_samples * 2, info->log_limit);
 
         if (bits != (uint32_t) -1)
             bits += log2overhead (info->dps [0].term, depth + 1);
@@ -903,7 +480,7 @@ static void delta_stereo (WavpackContext *wpc, WavpackExtraInfo *info)
             decorr_stereo_buffer (info, info->sampleptrs [i], info->sampleptrs [i+1], wps->wphdr.block_samples, i);
         }
 
-        bits = log2buffer (info->sampleptrs [i], wps->wphdr.block_samples * 2, info->log_limit);
+        bits = LOG2BUFFER (info->sampleptrs [i], wps->wphdr.block_samples * 2, info->log_limit);
 
         if (bits != (uint32_t) -1)
             bits += log2overhead (wps->decorr_passes [0].term, i);
@@ -928,7 +505,7 @@ static void delta_stereo (WavpackContext *wpc, WavpackExtraInfo *info)
             decorr_stereo_buffer (info, info->sampleptrs [i], info->sampleptrs [i+1], wps->wphdr.block_samples, i);
         }
 
-        bits = log2buffer (info->sampleptrs [i], wps->wphdr.block_samples * 2, info->log_limit);
+        bits = LOG2BUFFER (info->sampleptrs [i], wps->wphdr.block_samples * 2, info->log_limit);
 
         if (bits != (uint32_t) -1)
             bits += log2overhead (wps->decorr_passes [0].term, i);
@@ -972,7 +549,7 @@ static void sort_stereo (WavpackContext *wpc, WavpackExtraInfo *info)
             for (i = ri; i < info->nterms && wps->decorr_passes [i].term; ++i)
                 decorr_stereo_buffer (info, info->sampleptrs [i], info->sampleptrs [i+1], wps->wphdr.block_samples, i);
 
-            bits = log2buffer (info->sampleptrs [i], wps->wphdr.block_samples * 2, info->log_limit);
+            bits = LOG2BUFFER (info->sampleptrs [i], wps->wphdr.block_samples * 2, info->log_limit);
 
             if (bits != (uint32_t) -1)
                 bits += log2overhead (wps->decorr_passes [0].term, i);
@@ -1001,8 +578,6 @@ static void analyze_stereo (WavpackContext *wpc, int32_t *samples, int do_sample
     WavpackExtraInfo info;
     int i;
 
-    info.gt16bit = ((wps->wphdr.flags & MAG_MASK) >> MAG_LSB) >= 16;
-
 #ifdef LOG_LIMIT
     info.log_limit = (((wps->wphdr.flags & MAG_MASK) >> MAG_LSB) + 4) * 256;
 
@@ -1026,18 +601,15 @@ static void analyze_stereo (WavpackContext *wpc, int32_t *samples, int do_sample
     memcpy (info.sampleptrs [0], samples, wps->wphdr.block_samples * 8);
 
     for (i = 0; i < info.nterms && info.dps [i].term; ++i)
-        if (info.gt16bit)
-            decorr_stereo_pass (info.sampleptrs [i], info.sampleptrs [i + 1], wps->wphdr.block_samples, info.dps + i, 1);
-        else
-            decorr_stereo_pass_quick (info.sampleptrs [i], info.sampleptrs [i + 1], wps->wphdr.block_samples, info.dps + i, 1);
+        decorr_stereo_pass (info.sampleptrs [i], info.sampleptrs [i + 1], wps->wphdr.block_samples, info.dps + i, 1);
 
-    info.best_bits = log2buffer (info.sampleptrs [info.nterms], wps->wphdr.block_samples * 2, 0) * 1;
+    info.best_bits = LOG2BUFFER (info.sampleptrs [info.nterms], wps->wphdr.block_samples * 2, 0) * 1;
     info.best_bits += log2overhead (info.dps [0].term, i);
     memcpy (info.sampleptrs [info.nterms + 1], info.sampleptrs [i], wps->wphdr.block_samples * 8);
 
     if (wpc->config.extra_flags & EXTRA_BRANCHES)
         recurse_stereo (wpc, &info, 0, (int) floor (wps->delta_decay + 0.5),
-            log2buffer (info.sampleptrs [0], wps->wphdr.block_samples * 2, 0));
+            LOG2BUFFER (info.sampleptrs [0], wps->wphdr.block_samples * 2, 0));
 
     if (wpc->config.extra_flags & EXTRA_SORT_FIRST)
         sort_stereo (wpc, &info);
@@ -1137,6 +709,12 @@ void execute_stereo (WavpackContext *wpc, int32_t *samples, int no_history, int
     uint32_t best_size = (uint32_t) -1, size;
     int log_limit, force_js = 0, force_ts = 0, pi, i;
 
+#ifdef SKIP_DECORRELATION
+    CLEAR (wps->decorr_passes);
+    wps->num_terms = 0;
+    return;
+#endif
+
     for (i = 0; i < num_samples * 2; ++i)
         if (samples [i])
             break;
@@ -1216,7 +794,7 @@ void execute_stereo (WavpackContext *wpc, int32_t *samples, int no_history, int
         }
 
         wpds = &wps->decorr_specs [c];
-        nterms = (int) strlen (wpds->terms);
+        nterms = (int) strlen ((char *) wpds->terms);
 
         while (1) {
             if (force_js || (wpds->joint_stereo && !force_ts)) {
@@ -1258,14 +836,10 @@ void execute_stereo (WavpackContext *wpc, int32_t *samples, int no_history, int
                     reverse_decorr (&temp_decorr_pass);
 
                 memcpy (save_decorr_passes + j, &temp_decorr_pass, sizeof (struct decorr_pass));
-
-                if (((wps->wphdr.flags & MAG_MASK) >> MAG_LSB) >= 16)
-                    decorr_stereo_pass (temp_buffer [j&1], temp_buffer [~j&1], num_samples, &temp_decorr_pass, 1);
-                else
-                    decorr_stereo_pass_quick (temp_buffer [j&1], temp_buffer [~j&1], num_samples, &temp_decorr_pass, 1);
+                decorr_stereo_pass (temp_buffer [j&1], temp_buffer [~j&1], num_samples, &temp_decorr_pass, 1);
             }
 
-            size = log2buffer (temp_buffer [j&1], num_samples * 2, log_limit);
+            size = LOG2BUFFER (temp_buffer [j&1], num_samples * 2, log_limit);
 
             if (size == (uint32_t) -1 && nterms)
                 nterms >>= 1;
diff --git a/third_party/wavpack/src/float.c b/third_party/wavpack/src/float.c
deleted file mode 100644
index a01cfb3..0000000
--- a/third_party/wavpack/src/float.c
+++ /dev/null
@@ -1,371 +0,0 @@
-////////////////////////////////////////////////////////////////////////////
-//                           **** WAVPACK ****                            //
-//                  Hybrid Lossless Wavefile Compressor                   //
-//              Copyright (c) 1998 - 2006 Conifer Software.               //
-//                          All Rights Reserved.                          //
-//      Distributed under the BSD Software License (see license.txt)      //
-////////////////////////////////////////////////////////////////////////////
-
-// float.c
-
-#include "wavpack_local.h"
-
-#include <stdlib.h>
-
-#ifdef DEBUG_ALLOC
-#define malloc malloc_db
-#define realloc realloc_db
-#define free free_db
-void *malloc_db (uint32_t size);
-void *realloc_db (void *ptr, uint32_t size);
-void free_db (void *ptr);
-int32_t dump_alloc (void);
-#endif
-
-#ifndef NO_PACK
-
-void write_float_info (WavpackStream *wps, WavpackMetadata *wpmd)
-{
-    char *byteptr;
-
-    byteptr = wpmd->data = malloc (4);
-    wpmd->id = ID_FLOAT_INFO;
-    *byteptr++ = wps->float_flags;
-    *byteptr++ = wps->float_shift;
-    *byteptr++ = wps->float_max_exp;
-    *byteptr++ = wps->float_norm_exp;
-    wpmd->byte_length = (int32_t)(byteptr - (char *) wpmd->data);
-}
-
-int scan_float_data (WavpackStream *wps, f32 *values, int32_t num_values)
-{
-    int32_t shifted_ones = 0, shifted_zeros = 0, shifted_both = 0;
-    int32_t false_zeros = 0, neg_zeros = 0;
-    uint32_t ordata = 0, crc = 0xffffffff;
-    int32_t count, value, shift_count;
-    int max_exp = 0;
-    f32 *dp;
-
-    wps->float_shift = wps->float_flags = 0;
-
-    for (dp = values, count = num_values; count--; dp++) {
-        crc = crc * 27 + get_mantissa (*dp) * 9 + get_exponent (*dp) * 3 + get_sign (*dp);
-
-        if (get_exponent (*dp) > max_exp && get_exponent (*dp) < 255)
-            max_exp = get_exponent (*dp);
-    }
-
-    wps->crc_x = crc;
-
-    for (dp = values, count = num_values; count--; dp++) {
-        if (get_exponent (*dp) == 255) {
-            wps->float_flags |= FLOAT_EXCEPTIONS;
-            value = 0x1000000;
-            shift_count = 0;
-        }
-        else if (get_exponent (*dp)) {
-            shift_count = max_exp - get_exponent (*dp);
-            value = 0x800000 + get_mantissa (*dp);
-        }
-        else {
-            shift_count = max_exp ? max_exp - 1 : 0;
-            value = get_mantissa (*dp);
-
-//          if (get_mantissa (*dp))
-//              denormals++;
-        }
-
-        if (shift_count < 25)
-            value >>= shift_count;
-        else
-            value = 0;
-
-        if (!value) {
-            if (get_exponent (*dp) || get_mantissa (*dp))
-                ++false_zeros;
-            else if (get_sign (*dp))
-                ++neg_zeros;
-        }
-        else if (shift_count) {
-            int32_t mask = (1 << shift_count) - 1;
-
-            if (!(get_mantissa (*dp) & mask))
-                shifted_zeros++;
-            else if ((get_mantissa (*dp) & mask) == mask)
-                shifted_ones++;
-            else
-                shifted_both++;
-        }
-
-        ordata |= value;
-        * (int32_t *) dp = (get_sign (*dp)) ? -value : value;
-    }
-
-    wps->float_max_exp = max_exp;
-
-    if (shifted_both)
-        wps->float_flags |= FLOAT_SHIFT_SENT;
-    else if (shifted_ones && !shifted_zeros)
-        wps->float_flags |= FLOAT_SHIFT_ONES;
-    else if (shifted_ones && shifted_zeros)
-        wps->float_flags |= FLOAT_SHIFT_SAME;
-    else if (ordata && !(ordata & 1)) {
-        while (!(ordata & 1)) {
-            wps->float_shift++;
-            ordata >>= 1;
-        }
-
-        for (dp = values, count = num_values; count--; dp++)
-            * (int32_t *) dp >>= wps->float_shift;
-    }
-
-    wps->wphdr.flags &= ~MAG_MASK;
-
-    while (ordata) {
-        wps->wphdr.flags += 1 << MAG_LSB;
-        ordata >>= 1;
-    }
-
-    if (false_zeros || neg_zeros)
-        wps->float_flags |= FLOAT_ZEROS_SENT;
-
-    if (neg_zeros)
-        wps->float_flags |= FLOAT_NEG_ZEROS;
-
-//  error_line ("samples = %d, max exp = %d, pre-shift = %d, denormals = %d",
-//      num_values, max_exp, wps->float_shift, denormals);
-//  if (wps->float_flags & FLOAT_EXCEPTIONS)
-//      error_line ("exceptions!");
-//  error_line ("shifted ones/zeros/both = %d/%d/%d, true/neg/false zeros = %d/%d/%d",
-//      shifted_ones, shifted_zeros, shifted_both, true_zeros, neg_zeros, false_zeros);
-
-    return wps->float_flags & (FLOAT_EXCEPTIONS | FLOAT_ZEROS_SENT | FLOAT_SHIFT_SENT | FLOAT_SHIFT_SAME);
-}
-
-void send_float_data (WavpackStream *wps, f32 *values, int32_t num_values)
-{
-    int max_exp = wps->float_max_exp;
-    int32_t count, value, shift_count;
-    f32 *dp;
-
-    for (dp = values, count = num_values; count--; dp++) {
-        if (get_exponent (*dp) == 255) {
-            if (get_mantissa (*dp)) {
-                putbit_1 (&wps->wvxbits);
-                putbits (get_mantissa (*dp), 23, &wps->wvxbits);
-            }
-            else {
-                putbit_0 (&wps->wvxbits);
-            }
-
-            value = 0x1000000;
-            shift_count = 0;
-        }
-        else if (get_exponent (*dp)) {
-            shift_count = max_exp - get_exponent (*dp);
-            value = 0x800000 + get_mantissa (*dp);
-        }
-        else {
-            shift_count = max_exp ? max_exp - 1 : 0;
-            value = get_mantissa (*dp);
-        }
-
-        if (shift_count < 25)
-            value >>= shift_count;
-        else
-            value = 0;
-
-        if (!value) {
-            if (wps->float_flags & FLOAT_ZEROS_SENT) {
-                if (get_exponent (*dp) || get_mantissa (*dp)) {
-                    putbit_1 (&wps->wvxbits);
-                    putbits (get_mantissa (*dp), 23, &wps->wvxbits);
-
-                    if (max_exp >= 25) {
-                        putbits (get_exponent (*dp), 8, &wps->wvxbits);
-                    }
-
-                    putbit (get_sign (*dp), &wps->wvxbits);
-                }
-                else {
-                    putbit_0 (&wps->wvxbits);
-
-                    if (wps->float_flags & FLOAT_NEG_ZEROS)
-                        putbit (get_sign (*dp), &wps->wvxbits);
-                }
-            }
-        }
-        else if (shift_count) {
-            if (wps->float_flags & FLOAT_SHIFT_SENT) {
-                int32_t data = get_mantissa (*dp) & ((1 << shift_count) - 1);
-                putbits (data, shift_count, &wps->wvxbits);
-            }
-            else if (wps->float_flags & FLOAT_SHIFT_SAME) {
-                putbit (get_mantissa (*dp) & 1, &wps->wvxbits);
-            }
-        }
-    }
-}
-
-#endif
-
-#if !defined(NO_UNPACK) || defined(INFO_ONLY)
-
-int read_float_info (WavpackStream *wps, WavpackMetadata *wpmd)
-{
-    int bytecnt = wpmd->byte_length;
-    char *byteptr = wpmd->data;
-
-    if (bytecnt != 4)
-        return FALSE;
-
-    wps->float_flags = *byteptr++;
-    wps->float_shift = *byteptr++;
-    wps->float_max_exp = *byteptr++;
-    wps->float_norm_exp = *byteptr;
-    return TRUE;
-}
-
-#endif
-
-#ifndef NO_UNPACK
-
-static void float_values_nowvx (WavpackStream *wps, int32_t *values, int32_t num_values);
-
-void float_values (WavpackStream *wps, int32_t *values, int32_t num_values)
-{
-    uint32_t crc = wps->crc_x;
-
-    if (!bs_is_open (&wps->wvxbits)) {
-        float_values_nowvx (wps, values, num_values);
-        return;
-    }
-
-    while (num_values--) {
-        int shift_count = 0, exp = wps->float_max_exp;
-        f32 outval = 0;
-        uint32_t temp;
-
-        if (*values == 0) {
-            if (wps->float_flags & FLOAT_ZEROS_SENT) {
-                if (getbit (&wps->wvxbits)) {
-                    getbits (&temp, 23, &wps->wvxbits);
-                    set_mantissa (outval, temp);
-
-                    if (exp >= 25) {
-                        getbits (&temp, 8, &wps->wvxbits);
-                        set_exponent (outval, temp);
-                    }
-
-                    set_sign (outval, getbit (&wps->wvxbits));
-                }
-                else if (wps->float_flags & FLOAT_NEG_ZEROS)
-                    set_sign (outval, getbit (&wps->wvxbits));
-            }
-        }
-        else {
-            *values <<= wps->float_shift;
-
-            if (*values < 0) {
-                *values = -*values;
-                set_sign (outval, 1);
-            }
-
-            if (*values == 0x1000000) {
-                if (getbit (&wps->wvxbits)) {
-                    getbits (&temp, 23, &wps->wvxbits);
-                    set_mantissa (outval, temp);
-                }
-
-                set_exponent (outval, 255);
-            }
-            else {
-                if (exp)
-                    while (!(*values & 0x800000) && --exp) {
-                        shift_count++;
-                        *values <<= 1;
-                    }
-
-                if (shift_count) {
-                    if ((wps->float_flags & FLOAT_SHIFT_ONES) ||
-                        ((wps->float_flags & FLOAT_SHIFT_SAME) && getbit (&wps->wvxbits)))
-                            *values |= ((1 << shift_count) - 1);
-                    else if (wps->float_flags & FLOAT_SHIFT_SENT) {
-                        getbits (&temp, shift_count, &wps->wvxbits);
-                        *values |= temp & ((1 << shift_count) - 1);
-                    }
-                }
-
-                set_mantissa (outval, *values);
-                set_exponent (outval, exp);
-            }
-        }
-
-        crc = crc * 27 + get_mantissa (outval) * 9 + get_exponent (outval) * 3 + get_sign (outval);
-        * (f32 *) values++ = outval;
-    }
-
-    wps->crc_x = crc;
-}
-
-static void float_values_nowvx (WavpackStream *wps, int32_t *values, int32_t num_values)
-{
-    while (num_values--) {
-        int shift_count = 0, exp = wps->float_max_exp;
-        f32 outval = 0;
-
-        if (*values) {
-            *values <<= wps->float_shift;
-
-            if (*values < 0) {
-                *values = -*values;
-                set_sign (outval, 1);
-            }
-
-            if (*values >= 0x1000000) {
-                while (*values & 0xf000000) {
-                    *values >>= 1;
-                    ++exp;
-                }
-            }
-            else if (exp) {
-                while (!(*values & 0x800000) && --exp) {
-                    shift_count++;
-                    *values <<= 1;
-                }
-
-                if (shift_count && (wps->float_flags & FLOAT_SHIFT_ONES))
-                    *values |= ((1 << shift_count) - 1);
-            }
-
-            set_mantissa (outval, *values);
-            set_exponent (outval, exp);
-        }
-
-        * (f32 *) values++ = outval;
-    }
-}
-
-#endif
-
-void WavpackFloatNormalize (int32_t *values, int32_t num_values, int delta_exp)
-{
-    f32 *fvalues = (f32 *) values;
-    int exp;
-
-    if (!delta_exp)
-        return;
-
-    while (num_values--) {
-        if ((exp = get_exponent (*fvalues)) == 0 || exp + delta_exp <= 0)
-            *fvalues = 0;
-        else if (exp == 255 || (exp += delta_exp) >= 255) {
-            set_exponent (*fvalues, 255);
-            set_mantissa (*fvalues, 0);
-        }
-        else
-            set_exponent (*fvalues, exp);
-
-        fvalues++;
-    }
-}
diff --git a/third_party/wavpack/src/metadata.c b/third_party/wavpack/src/metadata.c
deleted file mode 100644
index 0fd8a49..0000000
--- a/third_party/wavpack/src/metadata.c
+++ /dev/null
@@ -1,313 +0,0 @@
-////////////////////////////////////////////////////////////////////////////
-//                           **** WAVPACK ****                            //
-//                  Hybrid Lossless Wavefile Compressor                   //
-//              Copyright (c) 1998 - 2006 Conifer Software.               //
-//                          All Rights Reserved.                          //
-//      Distributed under the BSD Software License (see license.txt)      //
-////////////////////////////////////////////////////////////////////////////
-
-// metadata.c
-
-// This module handles the metadata structure introduced in WavPack 4.0
-
-#include "wavpack_local.h"
-
-#include <stdlib.h>
-#include <string.h>
-
-#ifdef DEBUG_ALLOC
-#define malloc malloc_db
-#define realloc realloc_db
-#define free free_db
-void *malloc_db (uint32_t size);
-void *realloc_db (void *ptr, uint32_t size);
-void free_db (void *ptr);
-int32_t dump_alloc (void);
-#endif
-
-#if !defined(NO_UNPACK) || defined(INFO_ONLY)
-
-int read_metadata_buff (WavpackMetadata *wpmd, unsigned char *blockbuff, unsigned char **buffptr)
-{
-    WavpackHeader *wphdr = (WavpackHeader *) blockbuff;
-    unsigned char *buffend = blockbuff + wphdr->ckSize + 8;
-
-    if (buffend - *buffptr < 2)
-        return FALSE;
-
-    wpmd->id = *(*buffptr)++;
-    wpmd->byte_length = *(*buffptr)++ << 1;
-
-    if (wpmd->id & ID_LARGE) {
-        wpmd->id &= ~ID_LARGE;
-
-        if (buffend - *buffptr < 2)
-            return FALSE;
-
-        wpmd->byte_length += *(*buffptr)++ << 9;
-        wpmd->byte_length += *(*buffptr)++ << 17;
-    }
-
-    if (wpmd->id & ID_ODD_SIZE) {
-        wpmd->id &= ~ID_ODD_SIZE;
-        wpmd->byte_length--;
-    }
-
-    if (wpmd->byte_length) {
-        if (buffend - *buffptr < wpmd->byte_length + (wpmd->byte_length & 1)) {
-            wpmd->data = NULL;
-            return FALSE;
-        }
-
-        wpmd->data = *buffptr;
-        (*buffptr) += wpmd->byte_length + (wpmd->byte_length & 1);
-    }
-    else
-        wpmd->data = NULL;
-
-    return TRUE;
-}
-
-int process_metadata (WavpackContext *wpc, WavpackMetadata *wpmd)
-{
-    WavpackStream *wps = wpc->streams [wpc->current_stream];
-
-    switch (wpmd->id) {
-        case ID_DUMMY:
-            return TRUE;
-
-        case ID_DECORR_TERMS:
-            return read_decorr_terms (wps, wpmd);
-
-        case ID_DECORR_WEIGHTS:
-            return read_decorr_weights (wps, wpmd);
-
-        case ID_DECORR_SAMPLES:
-            return read_decorr_samples (wps, wpmd);
-
-        case ID_ENTROPY_VARS:
-            return read_entropy_vars (wps, wpmd);
-
-        case ID_HYBRID_PROFILE:
-            return read_hybrid_profile (wps, wpmd);
-
-        case ID_SHAPING_WEIGHTS:
-            return read_shaping_info (wps, wpmd);
-
-        case ID_FLOAT_INFO:
-            return read_float_info (wps, wpmd);
-
-        case ID_INT32_INFO:
-            return read_int32_info (wps, wpmd);
-
-        case ID_CHANNEL_INFO:
-            return read_channel_info (wpc, wpmd);
-
-        case ID_CONFIG_BLOCK:
-            return read_config_info (wpc, wpmd);
-
-        case ID_SAMPLE_RATE:
-            return read_sample_rate (wpc, wpmd);
-
-        case ID_WV_BITSTREAM:
-            return init_wv_bitstream (wps, wpmd);
-
-        case ID_WVC_BITSTREAM:
-            return init_wvc_bitstream (wps, wpmd);
-
-        case ID_WVX_BITSTREAM:
-            return init_wvx_bitstream (wps, wpmd);
-
-        case ID_RIFF_HEADER: case ID_RIFF_TRAILER:
-            return read_wrapper_data (wpc, wpmd);
-
-        case ID_MD5_CHECKSUM:
-            if (wpmd->byte_length == 16) {
-                memcpy (wpc->config.md5_checksum, wpmd->data, 16);
-                wpc->config.flags |= CONFIG_MD5_CHECKSUM;
-                wpc->config.md5_read = 1;
-            }
-
-            return TRUE;
-
-        default:
-            return (wpmd->id & ID_OPTIONAL_DATA) ? TRUE : FALSE;
-    }
-}
-
-#endif
-
-#ifndef NO_PACK
-
-int copy_metadata (WavpackMetadata *wpmd, unsigned char *buffer_start, unsigned char *buffer_end)
-{
-    uint32_t mdsize = wpmd->byte_length + (wpmd->byte_length & 1);
-    WavpackHeader *wphdr = (WavpackHeader *) buffer_start;
-
-    if (wpmd->byte_length & 1)
-        ((char *) wpmd->data) [wpmd->byte_length] = 0;
-
-    mdsize += (wpmd->byte_length > 510) ? 4 : 2;
-    buffer_start += wphdr->ckSize + 8;
-
-    if (buffer_start + mdsize >= buffer_end)
-        return FALSE;
-
-    buffer_start [0] = wpmd->id | (wpmd->byte_length & 1 ? ID_ODD_SIZE : 0);
-    buffer_start [1] = (wpmd->byte_length + 1) >> 1;
-
-    if (wpmd->byte_length > 510) {
-        buffer_start [0] |= ID_LARGE;
-        buffer_start [2] = (wpmd->byte_length + 1) >> 9;
-        buffer_start [3] = (wpmd->byte_length + 1) >> 17;
-    }
-
-    if (wpmd->data && wpmd->byte_length) {
-        if (wpmd->byte_length > 510) {
-            buffer_start [0] |= ID_LARGE;
-            buffer_start [2] = (wpmd->byte_length + 1) >> 9;
-            buffer_start [3] = (wpmd->byte_length + 1) >> 17;
-            memcpy (buffer_start + 4, wpmd->data, mdsize - 4);
-        }
-        else
-            memcpy (buffer_start + 2, wpmd->data, mdsize - 2);
-    }
-
-    wphdr->ckSize += mdsize;
-    return TRUE;
-}
-
-int add_to_metadata (WavpackContext *wpc, void *data, uint32_t bcount, unsigned char id)
-{
-    WavpackMetadata *mdp;
-    unsigned char *src = data;
-
-    while (bcount) {
-        if (wpc->metacount) {
-            uint32_t bc = bcount;
-
-            mdp = wpc->metadata + wpc->metacount - 1;
-
-            if (mdp->id == id) {
-                if (wpc->metabytes + bcount > 1000000)
-                    bc = 1000000 - wpc->metabytes;
-
-                mdp->data = realloc (mdp->data, mdp->byte_length + bc);
-                memcpy ((char *) mdp->data + mdp->byte_length, src, bc);
-                mdp->byte_length += bc;
-                wpc->metabytes += bc;
-                bcount -= bc;
-                src += bc;
-
-                if (wpc->metabytes >= 1000000 && !write_metadata_block (wpc))
-                    return FALSE;
-            }
-        }
-
-        if (bcount) {
-            wpc->metadata = realloc (wpc->metadata, (wpc->metacount + 1) * sizeof (WavpackMetadata));
-            mdp = wpc->metadata + wpc->metacount++;
-            mdp->byte_length = 0;
-            mdp->data = NULL;
-            mdp->id = id;
-        }
-    }
-
-    return TRUE;
-}
-
-static char *write_metadata (WavpackMetadata *wpmd, char *outdata)
-{
-    unsigned char id = wpmd->id, wordlen [3];
-
-    wordlen [0] = (wpmd->byte_length + 1) >> 1;
-    wordlen [1] = (wpmd->byte_length + 1) >> 9;
-    wordlen [2] = (wpmd->byte_length + 1) >> 17;
-
-    if (wpmd->byte_length & 1) {
-//      ((char *) wpmd->data) [wpmd->byte_length] = 0;
-        id |= ID_ODD_SIZE;
-    }
-
-    if (wordlen [1] || wordlen [2])
-        id |= ID_LARGE;
-
-    *outdata++ = id;
-    *outdata++ = wordlen [0];
-
-    if (id & ID_LARGE) {
-        *outdata++ = wordlen [1];
-        *outdata++ = wordlen [2];
-    }
-
-    if (wpmd->data && wpmd->byte_length) {
-        memcpy (outdata, wpmd->data, wpmd->byte_length);
-        outdata += wpmd->byte_length;
-
-        if (wpmd->byte_length & 1)
-            *outdata++ = 0;
-    }
-
-    return outdata;
-}
-
-int write_metadata_block (WavpackContext *wpc)
-{
-    char *block_buff, *block_ptr;
-    WavpackHeader *wphdr;
-
-    if (wpc->metacount) {
-        int metacount = wpc->metacount, block_size = sizeof (WavpackHeader);
-        WavpackMetadata *wpmdp = wpc->metadata;
-
-        while (metacount--) {
-            block_size += wpmdp->byte_length + (wpmdp->byte_length & 1);
-            block_size += (wpmdp->byte_length > 510) ? 4 : 2;
-            wpmdp++;
-        }
-
-        wphdr = (WavpackHeader *) (block_buff = malloc (block_size));
-
-        CLEAR (*wphdr);
-        memcpy (wphdr->ckID, "wvpk", 4);
-        wphdr->total_samples = wpc->total_samples;
-        wphdr->version = wpc->stream_version;
-        wphdr->ckSize = block_size - 8;
-        wphdr->block_samples = 0;
-
-        block_ptr = (char *)(wphdr + 1);
-
-        wpmdp = wpc->metadata;
-
-        while (wpc->metacount) {
-            block_ptr = write_metadata (wpmdp, block_ptr);
-            wpc->metabytes -= wpmdp->byte_length;
-            free_metadata (wpmdp++);
-            wpc->metacount--;
-        }
-
-        free (wpc->metadata);
-        wpc->metadata = NULL;
-        native_to_little_endian ((WavpackHeader *) block_buff, WavpackHeaderFormat);
-
-        if (!wpc->blockout (wpc->wv_out, block_buff, block_size)) {
-            free (block_buff);
-            strcpy (wpc->error_message, "can't write WavPack data, disk probably full!");
-            return FALSE;
-        }
-
-        free (block_buff);
-    }
-
-    return TRUE;
-}
-
-#endif
-
-void free_metadata (WavpackMetadata *wpmd)
-{
-    if (wpmd->data) {
-        free (wpmd->data);
-        wpmd->data = NULL;
-    }
-}
diff --git a/third_party/wavpack/src/open_filename.c b/third_party/wavpack/src/open_filename.c
new file mode 100644
index 0000000..4c74e67
--- /dev/null
+++ b/third_party/wavpack/src/open_filename.c
@@ -0,0 +1,304 @@
+////////////////////////////////////////////////////////////////////////////
+//                           **** WAVPACK ****                            //
+//                  Hybrid Lossless Wavefile Compressor                   //
+//              Copyright (c) 1998 - 2013 Conifer Software.               //
+//                          All Rights Reserved.                          //
+//      Distributed under the BSD Software License (see license.txt)      //
+////////////////////////////////////////////////////////////////////////////
+
+// open_filename.c
+
+// This module provides all the code required to open an existing WavPack
+// file, by filename, for reading. It does not contain the actual code to
+// unpack audio data and this was done so that programs that just want to
+// query WavPack files for information (like, for example, taggers) don't
+// need to link in a lot of unnecessary code.
+//
+// To allow opening files by filename, this code provides an interface
+// between the reader callback mechanism that WavPack uses internally and
+// the standard C stdio (FILE *) library. Note that in applications that do not
+// require opening files by filename, this module can be omitted (which
+// might make building easier).
+//
+// For Unicode support on Windows, a flag has been added (OPEN_FILE_UTF8)
+// that forces the filename string to be assumed UTF-8 and converted to
+// a widechar string suitable for _wfopen(). Without this flag we revert
+// to the previous behavior of simply calling fopen() and hoping that the
+// local character set works. This is ignored on non-Windows platforms
+// (which is okay because they are probably UTF-8 anyway).
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <io.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "wavpack_local.h"
+
+#include <fcntl.h>
+#include <sys/stat.h>
+
+#if (defined(__GNUC__) || defined(__sun)) && !defined(_WIN32)
+#include <unistd.h>
+#endif
+
+#ifdef __OS2__
+#include <io.h>
+#endif
+
+#ifdef _WIN32
+#define fileno _fileno
+static FILE *fopen_utf8 (const char *filename_utf8, const char *mode_utf8);
+#if !defined(S_ISREG) && defined(S_IFMT) && defined(S_IFREG)
+#define S_ISREG(m) (((m) & S_IFMT) == S_IFREG)
+#endif
+#endif
+
+#ifdef HAVE_FSEEKO
+#define fseek fseeko
+#define ftell ftello
+#endif
+
+static int32_t read_bytes (void *id, void *data, int32_t bcount)   // reader callback: id is the FILE* to read from
+{
+    return (int32_t) fread (data, 1, bcount, (FILE*) id);           // item size is 1, so the result is a byte count
+}
+
+static int64_t get_pos (void *id)           // reader callback: current offset of the FILE* passed as id
+{
+#ifdef _WIN32
+    return _ftelli64 ((FILE*) id);          // Windows: explicit 64-bit tell
+#else
+    return ftell ((FILE*) id);              // remapped to ftello() by the macro above when HAVE_FSEEKO is defined
+#endif
+}
+
+static int set_pos_abs (void *id, int64_t pos)      // reader callback: seek the FILE* passed as id to absolute offset pos
+{
+#ifdef _WIN32
+    return _fseeki64 (id, pos, SEEK_SET);           // Windows: explicit 64-bit seek
+#else
+    return fseek (id, pos, SEEK_SET);               // remapped to fseeko() by the macro above when HAVE_FSEEKO is defined
+#endif
+}
+
+static int set_pos_rel (void *id, int64_t delta, int mode)  // reader callback: seek by delta; mode is forwarded as the whence argument
+{
+#ifdef _WIN32
+    return _fseeki64 (id, delta, mode);                     // Windows: explicit 64-bit seek
+#else
+    return fseek (id, delta, mode);                         // remapped to fseeko() by the macro above when HAVE_FSEEKO is defined
+#endif
+}
+
+static int push_back_byte (void *id, int c)     // reader callback: un-read one byte on the FILE* passed as id
+{
+    return ungetc (c, id);                      // result of ungetc() is returned unchanged
+}
+
+#ifdef _WIN32
+
+static int64_t get_length (void *id)    // reader callback (Windows build): size of the file behind id, 0 on any failure
+{
+    LARGE_INTEGER Size;
+    HANDLE        fHandle;
+
+    if (id == NULL)
+        return 0;
+
+    fHandle = (HANDLE)_get_osfhandle(_fileno((FILE*) id));     // CRT stream -> descriptor -> OS file handle
+    if (fHandle == INVALID_HANDLE_VALUE)
+        return 0;
+
+    Size.u.LowPart = GetFileSize(fHandle, &Size.u.HighPart);   // low part is the return value, high part is written through the pointer
+
+    if (Size.u.LowPart == INVALID_FILE_SIZE && GetLastError() != NO_ERROR)  // the sentinel alone is not treated as an error; GetLastError() must agree
+        return 0;
+
+    return (int64_t)Size.QuadPart;                              // both halves combined into one 64-bit value
+}
+
+#else
+
+static int64_t get_length (void *id)    // reader callback (non-Windows build): size of the file behind id, 0 on failure
+{
+    FILE *file = id;
+    struct stat statbuf;
+
+    if (!file || fstat (fileno (file), &statbuf) || !S_ISREG(statbuf.st_mode))  // NULL stream, fstat() error, or not a regular file
+        return 0;
+
+    return statbuf.st_size;
+}
+
+#endif
+
+static int can_seek (void *id)          // reader callback: nonzero only when id is a non-NULL stream on a regular file
+{
+    FILE *file = id;
+    struct stat statbuf;
+
+    return file && !fstat (fileno (file), &statbuf) && S_ISREG(statbuf.st_mode);    // fstat() must succeed and report a regular file
+}
+
+static int32_t write_bytes (void *id, void *data, int32_t bcount)  // reader callback: id is the FILE* to write to
+{
+    return (int32_t) fwrite (data, 1, bcount, (FILE*) id);          // item size is 1, so the result is a byte count
+}
+
+#ifdef _WIN32
+
+static int truncate_here (void *id)             // reader callback (Windows build): cut the file at the current stream position
+{
+    FILE *file = id;
+    int64_t curr_pos = _ftelli64 (file);        // NOTE(review): the tell result is not checked for failure before use
+
+    return _chsize_s (fileno (file), curr_pos); // fileno is #defined to _fileno above for _WIN32
+}
+
+#else
+
+static int truncate_here (void *id)             // reader callback (non-Windows build): cut the file at the current stream position
+{
+    FILE *file = id;
+    off_t curr_pos = ftell (file);              // ftello() when HAVE_FSEEKO is defined; NOTE(review): result not checked for failure
+
+    return ftruncate (fileno (file), curr_pos); // result of ftruncate() is returned unchanged
+}
+
+#endif
+
+static int close_stream (void *id)      // reader callback: close the FILE* passed as id
+{
+    return fclose ((FILE*) id);         // result of fclose() is returned unchanged
+}
+
+//  int32_t (*read_bytes)(void *id, void *data, int32_t bcount);
+//  int32_t (*write_bytes)(void *id, void *data, int32_t bcount);
+//  int64_t (*get_pos)(void *id);                               // new signature for large files
+//  int (*set_pos_abs)(void *id, int64_t pos);                  // new signature for large files
+//  int (*set_pos_rel)(void *id, int64_t delta, int mode);      // new signature for large files
+//  int (*push_back_byte)(void *id, int c);
+//  int64_t (*get_length)(void *id);                            // new signature for large files
+//  int (*can_seek)(void *id);
+//  int (*truncate_here)(void *id);                             // new function to truncate file at current position
+//  int (*close)(void *id);                                     // new function to close file
+
+static WavpackStreamReader64 freader = {                            // stdio-backed callback table handed to WavpackOpenFileInputEx64()
+    read_bytes, write_bytes, get_pos, set_pos_abs, set_pos_rel,     // positional initializer: same order as the
+    push_back_byte, get_length, can_seek, truncate_here, close_stream   // commented callback list just above
+};
+
+// This function attempts to open the specified WavPack file for reading. If
+// this fails for any reason then an appropriate message is copied to "error"
+// (which must accept 80 characters) and NULL is returned, otherwise a
+// pointer to a WavpackContext structure is returned (which is used to call
+// all other functions in this module). A filename beginning with "-" is
+// assumed to be stdin. The "flags" argument has the following bit mask
+// values to specify details of the open operation:
+
+// OPEN_WVC:  attempt to open/read "correction" file
+// OPEN_TAGS:  attempt to read ID3v1 / APEv2 tags (requires seekable file)
+// OPEN_WRAPPER:  make audio wrapper available (i.e. RIFF) to caller
+// OPEN_2CH_MAX:  open only first stream of multichannel file (usually L/R)
+// OPEN_NORMALIZE:  normalize floating point data to +/- 1.0 (w/ offset exp)
+// OPEN_STREAMING:  blindly unpacks blocks w/o regard to header file position
+// OPEN_EDIT_TAGS:  allow editing of tags (file must be writable)
+// OPEN_FILE_UTF8:  assume infilename is UTF-8 encoded (Windows only)
+
+// Version 4.2 of the WavPack library adds the OPEN_STREAMING flag. This is
+// essentially a "raw" mode where the library will simply decode any blocks
+// fed it through the reader callback, regardless of where those blocks came
+// from in a stream. The only requirement is that complete WavPack blocks are
+// fed to the decoder (and this may require multiple blocks in multichannel
+// mode) and that complete blocks are decoded (even if all samples are not
+// actually required). All the blocks must contain the same number of channels
+// and bit resolution, and the correction data must be either present or not.
+// All other parameters may change from block to block (like lossy/lossless).
+// Obviously, in this mode any seeking must be performed by the application
+// (and again, decoding must start at the beginning of the block containing
+// the seek sample).
+
+WavpackContext *WavpackOpenFileInput (const char *infilename, char *error, int flags, int norm_offset)
+{
+    const char *file_mode = (flags & OPEN_EDIT_TAGS) ? "r+b" : "rb";   // tag editing opens the file read/write
+    FILE *(*fopen_func)(const char *, const char *) = fopen;
+    FILE *wv_id, *wvc_id = NULL;                                       // no correction file unless one is opened below
+
+#ifdef _WIN32
+    if (flags & OPEN_FILE_UTF8)
+        fopen_func = fopen_utf8;
+#endif
+
+    if (*infilename == '-') {                                          // a leading '-' selects stdin
+        wv_id = stdin;
+#if defined(_WIN32)
+        _setmode (fileno (stdin), O_BINARY);
+#endif
+#if defined(__OS2__)
+        setmode (fileno (stdin), O_BINARY);
+#endif
+    }
+    else if ((wv_id = fopen_func (infilename, file_mode)) == NULL) {
+        if (error) strcpy (error, (flags & OPEN_EDIT_TAGS) ? "can't open file for editing" : "can't open file");
+        return NULL;
+    }
+
+    if (wv_id != stdin && (flags & OPEN_WVC)) {
+        char *in2filename = malloc (strlen (infilename) + 10);         // room for the appended "c" and the terminator
+
+        if (in2filename) {      // an allocation failure is handled like a missing correction file
+            strcpy (in2filename, infilename);
+            strcat (in2filename, "c");                                  // correction file name = input name + "c"
+            wvc_id = fopen_func (in2filename, "rb");                    // may be NULL; passed through as-is
+            free (in2filename);
+        }
+    }
+
+    return WavpackOpenFileInputEx64 (&freader, wv_id, wvc_id, error, flags, norm_offset);
+}
+
+#ifdef _WIN32
+
+// The following code Copyright (c) 2004-2012 LoRd_MuldeR <mulder2@gmx.de>
+// (see cli/win32_unicode_support.c for full license)
+
+static wchar_t *utf8_to_utf16(const char *input)	// returns a malloc'd wide-char copy of input (caller frees), or NULL on failure
+{
+	wchar_t *Buffer = NULL;
+	int BuffSize, Result = 0;
+	BuffSize = MultiByteToWideChar(CP_UTF8, 0, input, -1, NULL, 0);	// first call only queries the required size
+	if(BuffSize > 0)	// a failed size query no longer triggers a zero-byte allocation
+		Buffer = (wchar_t*) malloc(sizeof(wchar_t) * BuffSize);
+	if(Buffer)
+		Result = MultiByteToWideChar(CP_UTF8, 0, input, -1, Buffer, BuffSize);
+	if((Result > 0) && (Result <= BuffSize))
+		return Buffer;
+	free(Buffer);	// conversion failed: release the buffer instead of leaking it (free(NULL) is a no-op)
+	return NULL;
+}
+
+
+static FILE *fopen_utf8(const char *filename_utf8, const char *mode_utf8)	// fopen() stand-in: converts both arguments and calls _wfopen()
+{
+	FILE *ret = NULL;
+	wchar_t *filename_utf16 = utf8_to_utf16(filename_utf8);
+	wchar_t *mode_utf16 = utf8_to_utf16(mode_utf8);
+	
+	if(filename_utf16 && mode_utf16)	// open only when both conversions succeeded; otherwise ret stays NULL
+	{
+		ret = _wfopen(filename_utf16, mode_utf16);
+	}
+
+	if(filename_utf16) free(filename_utf16);	// converted strings are released on every path
+	if(mode_utf16) free(mode_utf16);
+
+	return ret;
+}
+
+#endif
+
+
diff --git a/third_party/wavpack/src/open_legacy.c b/third_party/wavpack/src/open_legacy.c
new file mode 100644
index 0000000..fb61509
--- /dev/null
+++ b/third_party/wavpack/src/open_legacy.c
@@ -0,0 +1,114 @@
+////////////////////////////////////////////////////////////////////////////
+//                           **** WAVPACK ****                            //
+//                  Hybrid Lossless Wavefile Compressor                   //
+//                Copyright (c) 1998 - 2016 David Bryant.                 //
+//                          All Rights Reserved.                          //
+//      Distributed under the BSD Software License (see license.txt)      //
+////////////////////////////////////////////////////////////////////////////
+
+// open_legacy.c
+
+// This code provides an interface between the new reader callback mechanism that
+// WavPack uses internally and the old reader callback functions that did not
+// provide large file support.
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "wavpack_local.h"
+
+typedef struct {
+    WavpackStreamReader *reader;    // legacy callback table that the trans_* wrappers forward to
+    void *id;                       // stream handle passed to those legacy callbacks
+} WavpackReaderTranslator;
+
+static int32_t trans_read_bytes (void *id, void *data, int32_t bcount)
+{
+    WavpackReaderTranslator *xlat = (WavpackReaderTranslator *) id;    // id is the translator record
+    return xlat->reader->read_bytes (xlat->id, data, bcount);          // forward to the legacy reader
+}
+
+static int32_t trans_write_bytes (void *id, void *data, int32_t bcount)
+{
+    WavpackReaderTranslator *xlat = (WavpackReaderTranslator *) id;    // id is the translator record
+    return xlat->reader->write_bytes (xlat->id, data, bcount);         // forward to the legacy reader
+}
+
+static int64_t trans_get_pos (void *id)
+{
+    WavpackReaderTranslator *xlat = (WavpackReaderTranslator *) id;    // id is the translator record
+    return xlat->reader->get_pos (xlat->id);                           // legacy result widened to 64 bits
+}
+
+static int trans_set_pos_abs (void *id, int64_t pos)
+{
+    WavpackReaderTranslator *xlat = (WavpackReaderTranslator *) id;    // id is the translator record
+    return xlat->reader->set_pos_abs (xlat->id, (uint32_t) pos);       // position is narrowed to 32 bits
+}
+
+static int trans_set_pos_rel (void *id, int64_t delta, int mode)
+{
+    WavpackReaderTranslator *xlat = (WavpackReaderTranslator *) id;        // id is the translator record
+    return xlat->reader->set_pos_rel (xlat->id, (int32_t) delta, mode);    // offset is narrowed to 32 bits
+}
+
+static int trans_push_back_byte (void *id, int c)
+{
+    WavpackReaderTranslator *xlat = (WavpackReaderTranslator *) id;    // id is the translator record
+    return xlat->reader->push_back_byte (xlat->id, c);                 // forward to the legacy reader
+}
+
+static int64_t trans_get_length (void *id)
+{
+    WavpackReaderTranslator *xlat = (WavpackReaderTranslator *) id;    // id is the translator record
+    return xlat->reader->get_length (xlat->id);                        // legacy result widened to 64 bits
+}
+
+static int trans_can_seek (void *id)
+{
+    WavpackReaderTranslator *xlat = (WavpackReaderTranslator *) id;    // id is the translator record
+    return xlat->reader->can_seek (xlat->id);                          // forward to the legacy reader
+}
+
+static int trans_close_stream (void *id)
+{
+    free (id);      // releases only the translator record; no legacy callback is invoked here
+    return 0;
+}
+
+static WavpackStreamReader64 trans_reader = {     // 64-bit reader table built from the trans_* wrappers
+    trans_read_bytes, trans_write_bytes, trans_get_pos, trans_set_pos_abs, trans_set_pos_rel,
+    trans_push_back_byte, trans_get_length, trans_can_seek, NULL, trans_close_stream
+};
+
+// This function is identical to WavpackOpenFileInputEx64() except that it takes the
+// old reader callbacks (no large file support) and wraps each stream in a translation
+// layer. It is provided as a compatibility function for existing applications. To keep
+// streaming applications working, OPEN_NO_CHECKSUM is forced on when OPEN_STREAMING is
+// set. NULL is returned (with a message in "error") if a translator can't be allocated.
+
+WavpackContext *WavpackOpenFileInputEx (WavpackStreamReader *reader, void *wv_id, void *wvc_id, char *error, int flags, int norm_offset)
+{
+    WavpackReaderTranslator *trans_wv = NULL, *trans_wvc = NULL;
+
+    // this prevents existing streaming applications from failing if they try to pass
+    // in blocks that have been modified from the original (e.g., Matroska blocks)
+    if (flags & OPEN_STREAMING)
+        flags |= OPEN_NO_CHECKSUM;
+
+    if (wv_id)
+        trans_wv = (WavpackReaderTranslator *)malloc (sizeof (WavpackReaderTranslator));
+    if (wvc_id)
+        trans_wvc = (WavpackReaderTranslator *)malloc (sizeof (WavpackReaderTranslator));
+
+    // a failed allocation must not be dereferenced below, so give up cleanly instead
+    if ((wv_id && !trans_wv) || (wvc_id && !trans_wvc)) {
+        if (error) strcpy (error, "can't allocate memory");
+        free (trans_wv);
+        free (trans_wvc);
+        return NULL;
+    }
+    if (trans_wv) { trans_wv->reader = reader; trans_wv->id = wv_id; }
+    if (trans_wvc) { trans_wvc->reader = reader; trans_wvc->id = wvc_id; }
+    return WavpackOpenFileInputEx64 (&trans_reader, trans_wv, trans_wvc, error, flags, norm_offset);
+}
diff --git a/third_party/wavpack/src/open_raw.c b/third_party/wavpack/src/open_raw.c
new file mode 100644
index 0000000..6fda6f3
--- /dev/null
+++ b/third_party/wavpack/src/open_raw.c
@@ -0,0 +1,315 @@
+////////////////////////////////////////////////////////////////////////////
+//                           **** WAVPACK ****                            //
+//                  Hybrid Lossless Wavefile Compressor                   //
+//                Copyright (c) 1998 - 2016 David Bryant.                 //
+//                          All Rights Reserved.                          //
+//      Distributed under the BSD Software License (see license.txt)      //
+////////////////////////////////////////////////////////////////////////////
+
+// open_raw.c
+
+// This code provides the ability to decode WavPack frames directly from
+// memory for use in a streaming application. It can handle full blocks
+// or the headerless block data provided by Matroska and the DirectShow
+// WavPack splitter. For information about how Matroska stores WavPack,
+// see: https://www.matroska.org/technical/specs/codecid/wavpack.html
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "wavpack_local.h"
+
+typedef struct {
+    unsigned char *sptr, *dptr, *eptr, free_required;   // buffer start, read cursor, end (exclusive); nonzero free_required => sptr is free()d on close
+} RawSegment;
+
+typedef struct {
+    RawSegment *segments;                       // array of num_segments entries, consumed in order
+    int num_segments, curr_segment;             // entry count and index of the segment being read
+    unsigned char ungetc_char, ungetc_flag;     // one pushed-back byte and whether it is pending
+} WavpackRawContext;
+
+static int32_t raw_read_bytes (void *id, void *data, int32_t bcount)
+{
+    // Copies up to bcount bytes into data: first any pushed-back byte, then bytes
+    // from the remaining segments in order. Returns the number actually copied.
+    WavpackRawContext *ctx = id;
+    unsigned char *dst = data;
+
+    while (bcount) {
+        RawSegment *seg;
+        int chunk;
+
+        if (ctx->ungetc_flag) {
+            ctx->ungetc_flag = 0;
+            *dst++ = ctx->ungetc_char;
+            bcount--;
+            continue;
+        }
+        if (ctx->curr_segment >= ctx->num_segments)
+            break;                                          // every segment has been consumed
+
+        seg = ctx->segments + ctx->curr_segment;
+        chunk = (int)(seg->eptr - seg->dptr);
+        if (chunk > bcount) chunk = bcount;
+        memcpy (dst, seg->dptr, chunk);
+        dst += chunk;
+        bcount -= chunk;
+        seg->dptr += chunk;
+        if (seg->dptr == seg->eptr) ctx->curr_segment++;    // segment exhausted, move to the next
+    }
+    return (int32_t)(dst - (unsigned char *) data);
+}
+
+static int32_t raw_write_bytes (void *id, void *data, int32_t bcount)
+{
+    return 0;       // writing is not implemented for raw input: nothing is stored, 0 is reported
+}
+
+static int64_t raw_get_pos (void *id)
+{
+    return 0;       // no stream position is tracked for the segment list; always reports 0
+}
+
+static int raw_set_pos_abs (void *id, int64_t pos)
+{
+    return 0;       // the request is ignored: the read cursor is not moved
+}
+
+static int raw_set_pos_rel (void *id, int64_t delta, int mode)
+{
+    return 0;       // the request is ignored: the read cursor is not moved
+}
+
+static int raw_push_back_byte (void *id, int c)
+{
+    WavpackRawContext *ctx = id;
+    ctx->ungetc_flag = 1;       // raw_read_bytes() returns the saved byte before segment data
+    ctx->ungetc_char = c;       // a single slot: a second push-back overwrites the first
+    return c;
+}
+
+static int64_t raw_get_length (void *id)
+{
+    return 0;       // the total length is not computed for raw input; always reports 0
+}
+
+static int raw_can_seek (void *id)
+{
+    return 0;       // always 0: raw input reports itself as not seekable
+}
+
+static int raw_close_stream (void *id)
+{
+    // Frees the context, its segment table and every segment flagged free_required; NULL is accepted.
+    WavpackRawContext *ctx = id;
+    int seg;
+
+    if (!ctx)
+        return 0;
+
+    for (seg = 0; seg < ctx->num_segments; ++seg)
+        if (ctx->segments [seg].free_required && ctx->segments [seg].sptr)
+            free (ctx->segments [seg].sptr);
+    if (ctx->segments) free (ctx->segments);
+    free (ctx);
+    return 0;
+}
+
+static WavpackStreamReader64 raw_reader = {       // reader table used for in-memory (raw) decoding
+    raw_read_bytes, raw_write_bytes, raw_get_pos, raw_set_pos_abs, raw_set_pos_rel,
+    raw_push_back_byte, raw_get_length, raw_can_seek, NULL, raw_close_stream
+};
+
+// Read a 32-bit little-endian value at *cursor and advance past it. The bytes are
+// widened before shifting so a top byte >= 0x80 never overflows a signed int.
+static uint32_t raw_get_le32 (unsigned char **cursor)
+{
+    unsigned char *cp = *cursor;
+
+    *cursor = cp + 4;
+    return cp [0] | ((uint32_t) cp [1] << 8) | ((uint32_t) cp [2] << 16) | ((uint32_t) cp [3] << 24);
+}
+
+// Allocate a zeroed raw context. If "data" is not NULL the context starts with one
+// caller-owned segment covering "size" bytes of it. Returns NULL if out of memory.
+static WavpackRawContext *raw_new_context (unsigned char *data, int32_t size)
+{
+    WavpackRawContext *rcxt = calloc (1, sizeof (WavpackRawContext));
+
+    if (rcxt && data) {
+        if (!(rcxt->segments = malloc (sizeof (RawSegment)))) {
+            free (rcxt);
+            return NULL;
+        }
+
+        rcxt->num_segments = 1;
+        rcxt->segments [0].dptr = rcxt->segments [0].sptr = data;
+        rcxt->segments [0].eptr = data + size;
+        rcxt->segments [0].free_required = 0;
+    }
+
+    return rcxt;
+}
+
+// Append one block to "rcxt" as two segments: a newly allocated WavpackHeader built
+// from the supplied fields (freed with the context) and the caller-owned payload.
+// Returns 1 on success or 0, with no block added, if memory cannot be allocated.
+static int raw_add_block (WavpackRawContext *rcxt, unsigned char *payload, uint32_t block_size,
+    uint32_t block_samples, uint32_t wphdr_flags, uint32_t crc, int16_t version)
+{
+    RawSegment *segs = realloc (rcxt->segments, sizeof (RawSegment) * (rcxt->num_segments + 2));
+    WavpackHeader *wphdr = calloc (1, sizeof (WavpackHeader));
+
+    if (segs)
+        rcxt->segments = segs;      // the table may have moved even if the header failed
+    if (!segs || !wphdr) {
+        free (wphdr);
+        return 0;
+    }
+
+    memcpy (wphdr->ckID, "wvpk", 4);
+    wphdr->ckSize = sizeof (WavpackHeader) - 8 + block_size;
+    SET_TOTAL_SAMPLES (*wphdr, block_samples);
+    wphdr->block_samples = block_samples;
+    wphdr->version = version;
+    wphdr->flags = wphdr_flags;
+    wphdr->crc = crc;
+    WavpackLittleEndianToNative (wphdr, WavpackHeaderFormat);
+
+    segs += rcxt->num_segments;
+    segs [0].dptr = segs [0].sptr = (unsigned char *) wphdr;
+    segs [0].eptr = segs [0].dptr + sizeof (WavpackHeader);
+    segs [0].free_required = 1;
+    segs [1].dptr = segs [1].sptr = payload;
+    segs [1].eptr = payload + block_size;
+    segs [1].free_required = 0;
+    rcxt->num_segments += 2;
+    return 1;
+}
+
+// This function is similar to WavpackOpenFileInput() except that instead of
+// providing a filename to open, the caller provides pointers to buffered
+// WavPack frames (both standard and, optionally, correction data). It
+// decodes only a single frame. Note that in this context, a "frame" is a
+// collection of WavPack blocks that represent all the channels present. In
+// the case of mono or [most] stereo streams, this is the same thing, but
+// for multichannel streams each frame consists of several WavPack blocks
+// (which can contain only 1 or 2 channels). The buffers are referenced, not
+// copied. On failure NULL is returned and, when "error" is not NULL, a
+// message describing the problem is copied into it.
+
+WavpackContext *WavpackOpenRawDecoder (
+    void *main_data, int32_t main_size,
+    void *corr_data, int32_t corr_size,
+    int16_t version, char *error, int flags, int norm_offset)
+{
+    WavpackRawContext *raw_wv = NULL, *raw_wvc = NULL;
+    const char *failure = NULL;
+
+    // a NULL main buffer would crash the header test below and negative sizes
+    // would wrap to huge unsigned byte counts, so reject both up front
+    if (!main_data || main_size < 0 || (corr_data && corr_size < 0)) {
+        if (error) strcpy (error, "invalid raw data buffer!");
+        return NULL;
+    }
+
+    // if the WavPack data does not contain headers we assume Matroska-style storage
+    // and recreate the missing headers (fewer than 4 bytes cannot hold the "wvpk" ID)
+    if (main_size < 4 || memcmp (main_data, "wvpk", 4)) {
+        uint32_t multiple_blocks = 0, block_size, block_samples = 0, wphdr_flags, crc;
+        uint32_t main_bytes = main_size, corr_bytes = corr_data ? corr_size : 0;
+        unsigned char *mcp = main_data;
+        unsigned char *ccp = corr_data;
+
+        raw_wv = raw_new_context (NULL, 0);
+        raw_wvc = corr_bytes ? raw_new_context (NULL, 0) : NULL;
+        if (!raw_wv || (corr_bytes && !raw_wvc))
+            failure = "can't allocate memory";
+
+        while (!failure && main_bytes >= 12) {
+            int first_block = !raw_wv->num_segments;
+            if (first_block) {
+                block_samples = raw_get_le32 (&mcp);
+                main_bytes -= 4;
+            }
+            wphdr_flags = raw_get_le32 (&mcp);
+            crc = raw_get_le32 (&mcp);
+            main_bytes -= 8;
+
+            // a first block without the FINAL_BLOCK flag means there are multiple blocks
+            if (first_block && !(wphdr_flags & FINAL_BLOCK))
+                multiple_blocks = 1;
+
+            // the 12-byte loop guard covers the fields above for every block, but not
+            // the extra size word of a first block that belongs to a multiblock frame
+            if (!multiple_blocks)
+                block_size = main_bytes;
+            else if (main_bytes < 4)
+                block_size = (uint32_t) -1;         // forces the overrun error below
+            else {
+                block_size = raw_get_le32 (&mcp);
+                main_bytes -= 4;
+            }
+
+            if (block_size > main_bytes)
+                failure = "main block overran available data!";
+            else if (!raw_add_block (raw_wv, mcp, block_size, block_samples, wphdr_flags, crc, version))
+                failure = "can't allocate memory";
+            if (failure)
+                break;
+            main_bytes -= block_size;
+            mcp += block_size;
+
+            if (corr_bytes >= 4) {
+                crc = raw_get_le32 (&ccp);
+                corr_bytes -= 4;
+                if (!multiple_blocks)
+                    block_size = corr_bytes;
+                else if (corr_bytes < 4)
+                    block_size = (uint32_t) -1;     // forces the overrun error below
+                else {
+                    block_size = raw_get_le32 (&ccp);
+                    corr_bytes -= 4;
+                }
+                if (block_size > corr_bytes)
+                    failure = "correction block overran available data!";
+                else if (!raw_add_block (raw_wvc, ccp, block_size, block_samples, wphdr_flags, crc, version))
+                    failure = "can't allocate memory";
+                else {
+                    corr_bytes -= block_size;
+                    ccp += block_size;
+                }
+            }
+        }
+
+        if (!failure && (main_bytes || corr_bytes))
+            failure = "leftover multiblock data!";
+    }
+    else {      // the case of WavPack blocks with headers is much easier...
+        raw_wv = raw_new_context (main_data, main_size);
+        raw_wvc = (corr_data && corr_size) ? raw_new_context (corr_data, corr_size) : NULL;
+        if (!raw_wv || (corr_data && corr_size && !raw_wvc))
+            failure = "can't allocate memory";
+    }
+
+    if (failure) {
+        if (error) strcpy (error, failure);
+        raw_close_stream (raw_wv);
+        raw_close_stream (raw_wvc);
+        return NULL;
+    }
+
+    return WavpackOpenFileInputEx64 (&raw_reader, raw_wv, raw_wvc, error, flags | OPEN_STREAMING | OPEN_NO_CHECKSUM, norm_offset);
+}
+
+// Report how many samples the current frame holds (for a raw decoder that is the
+// only frame). If the context has no first stream the result is (uint32_t) -1.
+uint32_t WavpackGetNumSamplesInFrame (WavpackContext *wpc)
+{
+    if (!wpc || !wpc->streams || !wpc->streams [0])
+        return (uint32_t) -1;
+
+    return wpc->streams [0]->wphdr.block_samples;
+}
+
diff --git a/third_party/wavpack/src/open_utils.c b/third_party/wavpack/src/open_utils.c
new file mode 100644
index 0000000..c880d34
--- /dev/null
+++ b/third_party/wavpack/src/open_utils.c
@@ -0,0 +1,1279 @@
+////////////////////////////////////////////////////////////////////////////
+//                           **** WAVPACK ****                            //
+//                  Hybrid Lossless Wavefile Compressor                   //
+//                Copyright (c) 1998 - 2016 David Bryant.                 //
+//                          All Rights Reserved.                          //
+//      Distributed under the BSD Software License (see license.txt)      //
+////////////////////////////////////////////////////////////////////////////
+
+// open_utils.c
+
+// This module provides all the code required to open an existing WavPack file
+// for reading by using a reader callback mechanism (NOT a filename). This
+// includes the code required to find and parse WavPack blocks, process any
+// included metadata, and queue up the bitstreams containing the encoded audio
+// data. It does not contain the actual code to unpack audio data; this was done so
+// that programs that just want to query WavPack files for information (like,
+// for example, taggers) don't need to link in a lot of unnecessary code.
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "wavpack_local.h"
+
+// This function is identical to WavpackOpenFileInput() except that instead
+// of providing a filename to open, the caller provides a pointer to a set of
+// reader callbacks and instances of up to two streams. The first of these
+// streams is required and contains the regular WavPack data stream; the second
+// contains the "correction" file if desired. Unlike the standard open
+// function which handles the correction file transparently, in this case it
+// is the responsibility of the caller to be aware of correction files.
+
+static int seek_eof_information (WavpackContext *wpc, int64_t *final_index, int get_wrapper);
+
+WavpackContext *WavpackOpenFileInputEx64 (WavpackStreamReader64 *reader, void *wv_id, void *wvc_id, char *error, int flags, int norm_offset)
+{
+    WavpackContext *wpc = (WavpackContext *)malloc (sizeof (WavpackContext));    // context returned to the caller on success
+    WavpackStream *wps;
+    int num_blocks = 0;
+    unsigned char first_byte;
+    uint32_t bcount;
+
+    if (!wpc) {
+        if (error) strcpy (error, "can't allocate memory");
+        return NULL;
+    }
+
+    CLEAR (*wpc);
+    wpc->wv_in = wv_id;
+    wpc->wvc_in = wvc_id;
+    wpc->reader = reader;
+    wpc->total_samples = -1;                // -1 means "not known yet" (tested against -1 further down)
+    wpc->norm_offset = norm_offset;
+    wpc->max_streams = OLD_MAX_STREAMS;     // use this until overwritten with actual number
+    wpc->open_flags = flags;
+
+    wpc->filelen = wpc->reader->get_length (wpc->wv_in);
+
+#ifndef NO_TAGS
+    if ((flags & (OPEN_TAGS | OPEN_EDIT_TAGS)) && wpc->reader->can_seek (wpc->wv_in)) {
+        load_tag (wpc);
+        wpc->reader->set_pos_abs (wpc->wv_in, 0);      // rewind after the tag code has moved the stream
+
+        if ((flags & OPEN_EDIT_TAGS) && !editable_tag (&wpc->m_tag)) {
+            if (error) strcpy (error, "can't edit tags located at the beginning of files!");
+            return WavpackCloseFile (wpc);
+        }
+    }
+#endif
+
+    if (wpc->reader->read_bytes (wpc->wv_in, &first_byte, 1) != 1) {
+        if (error) strcpy (error, "can't read all of WavPack file!");
+        return WavpackCloseFile (wpc);
+    }
+
+    wpc->reader->push_back_byte (wpc->wv_in, first_byte);   // only peeking: the byte is handed back to the stream
+
+    if (first_byte == 'R') {                // a leading 'R' selects the legacy-file handling below
+#ifdef ENABLE_LEGACY
+        return open_file3 (wpc, error);
+#else
+        if (error) strcpy (error, "this legacy WavPack file is deprecated, use version 4.80.0 to transcode");
+        return WavpackCloseFile (wpc);
+#endif
+    }
+
+    wpc->streams = (WavpackStream **)(malloc ((wpc->num_streams = 1) * sizeof (wpc->streams [0])));
+    if (!wpc->streams) {
+        if (error) strcpy (error, "can't allocate memory");
+        return WavpackCloseFile (wpc);
+    }
+
+    wpc->streams [0] = wps = (WavpackStream *)malloc (sizeof (WavpackStream));
+    if (!wps) {
+        if (error) strcpy (error, "can't allocate memory");
+        return WavpackCloseFile (wpc);
+    }
+    CLEAR (*wps);
+
+    while (!wps->wphdr.block_samples) {     // keep reading blocks until one with a nonzero sample count is accepted
+
+        wpc->filepos = wpc->reader->get_pos (wpc->wv_in);
+        bcount = read_next_header (wpc->reader, wpc->wv_in, &wps->wphdr);
+
+        if (bcount == (uint32_t) -1 ||
+            (!wps->wphdr.block_samples && num_blocks++ > 16)) {     // gives up once too many sample-less blocks were seen
+                if (error) strcpy (error, "not compatible with this version of WavPack file!");
+                return WavpackCloseFile (wpc);
+        }
+
+        wpc->filepos += bcount;
+        wps->blockbuff = (unsigned char *)malloc (wps->wphdr.ckSize + 8);  // NOTE(review): a buffer kept from an earlier sample-less block is overwritten here unless freed elsewhere - confirm
+        if (!wps->blockbuff) {
+            if (error) strcpy (error, "can't allocate memory");
+            return WavpackCloseFile (wpc);
+        }
+        memcpy (wps->blockbuff, &wps->wphdr, 32);       // the header fills the first 32 bytes of the block buffer
+        // then read the rest of the block: ckSize + 8 bytes in total, less the 32 header bytes
+        if (wpc->reader->read_bytes (wpc->wv_in, wps->blockbuff + 32, wps->wphdr.ckSize - 24) != wps->wphdr.ckSize - 24) {
+            if (error) strcpy (error, "can't read all of WavPack file!");
+            return WavpackCloseFile (wpc);
+        }
+
+        // if block does not verify, flag error, free buffer, and continue
+        if (!WavpackVerifySingleBlock (wps->blockbuff, !(flags & OPEN_NO_CHECKSUM))) {
+            wps->wphdr.block_samples = 0;
+            free (wps->blockbuff);
+            wps->blockbuff = NULL;
+            wpc->crc_errors++;
+            continue;
+        }
+
+        wps->init_done = FALSE;
+
+        if (wps->wphdr.block_samples) {
+            if (flags & OPEN_STREAMING)
+                SET_BLOCK_INDEX (wps->wphdr, 0);
+            else if (wpc->total_samples == -1) {
+                if (GET_BLOCK_INDEX (wps->wphdr) || GET_TOTAL_SAMPLES (wps->wphdr) == -1) {
+                    wpc->initial_index = GET_BLOCK_INDEX (wps->wphdr);
+                    SET_BLOCK_INDEX (wps->wphdr, 0);
+
+                    if (wpc->reader->can_seek (wpc->wv_in)) {
+                        int64_t final_index = -1;
+
+                        seek_eof_information (wpc, &final_index, FALSE);
+
+                        if (final_index != -1)
+                            wpc->total_samples = final_index - wpc->initial_index;
+                    }
+                }
+                else
+                    wpc->total_samples = GET_TOTAL_SAMPLES (wps->wphdr);
+            }
+        }
+        else if (wpc->total_samples == -1 && !GET_BLOCK_INDEX (wps->wphdr) && GET_TOTAL_SAMPLES (wps->wphdr))
+            wpc->total_samples = GET_TOTAL_SAMPLES (wps->wphdr);
+
+        if (wpc->wvc_in && wps->wphdr.block_samples && (wps->wphdr.flags & HYBRID_FLAG)) {
+            unsigned char ch;
+
+            if (wpc->reader->read_bytes (wpc->wvc_in, &ch, 1) == 1) {      // the correction stream counts only if it is readable
+                wpc->reader->push_back_byte (wpc->wvc_in, ch);
+                wpc->file2len = wpc->reader->get_length (wpc->wvc_in);
+                wpc->wvc_flag = TRUE;
+            }
+        }
+
+        if (wpc->wvc_flag && !read_wvc_block (wpc)) {
+            if (error) strcpy (error, "not compatible with this version of correction file!");
+            return WavpackCloseFile (wpc);
+        }
+
+        if (!wps->init_done && !unpack_init (wpc)) {
+            if (error) strcpy (error, wpc->error_message [0] ? wpc->error_message :
+                "not compatible with this version of WavPack file!");
+
+            return WavpackCloseFile (wpc);
+        }
+
+        wps->init_done = TRUE;
+    }
+
+    wpc->config.flags &= ~0xff;                         // the low 8 configuration flag bits are
+    wpc->config.flags |= wps->wphdr.flags & 0xff;       // replaced by those of the block header
+
+    if (!wpc->config.num_channels) {
+        wpc->config.num_channels = (wps->wphdr.flags & MONO_FLAG) ? 1 : 2;
+        wpc->config.channel_mask = 0x5 - wpc->config.num_channels;     // 0x4 for one channel, 0x3 for two
+    }
+
+    if ((flags & OPEN_2CH_MAX) && !(wps->wphdr.flags & FINAL_BLOCK))
+        wpc->reduced_channels = (wps->wphdr.flags & MONO_FLAG) ? 1 : 2;
+
+    if (wps->wphdr.flags & DSD_FLAG) {
+#ifdef ENABLE_DSD
+        if (flags & OPEN_DSD_NATIVE) {
+            wpc->config.bytes_per_sample = 1;
+            wpc->config.bits_per_sample = 8;
+        }
+        else if (flags & OPEN_DSD_AS_PCM) {
+            wpc->decimation_context = decimate_dsd_init (wpc->reduced_channels ?
+                wpc->reduced_channels : wpc->config.num_channels);
+
+            wpc->config.bytes_per_sample = 3;
+            wpc->config.bits_per_sample = 24;
+        }
+        else {
+            if (error) strcpy (error, "not configured to handle DSD WavPack files!");
+            return WavpackCloseFile (wpc);
+        }
+#else
+        if (error) strcpy (error, "not configured to handle DSD WavPack files!");
+        return WavpackCloseFile (wpc);
+#endif
+    }
+    else {
+        wpc->config.bytes_per_sample = (wps->wphdr.flags & BYTES_STORED) + 1;
+        wpc->config.float_norm_exp = wps->float_norm_exp;
+
+        wpc->config.bits_per_sample = (wpc->config.bytes_per_sample * 8) -
+            ((wps->wphdr.flags & SHIFT_MASK) >> SHIFT_LSB);
+    }
+
+    if (!wpc->config.sample_rate) {         // only when no sample rate has been set in the configuration so far
+        if (!wps->wphdr.block_samples || (wps->wphdr.flags & SRATE_MASK) == SRATE_MASK)
+            wpc->config.sample_rate = 44100;
+        else
+            wpc->config.sample_rate = sample_rates [(wps->wphdr.flags & SRATE_MASK) >> SRATE_LSB];
+    }
+
+    return wpc;
+}
+
+// Report the major version of the WavPack program (or library) that wrote the
+// open file: 4 or 5 for current-format files, or whatever get_version3() reports
+// for legacy streams when ENABLE_LEGACY is defined. A NULL context yields 0.
+
+int WavpackGetVersion (WavpackContext *wpc)
+{
+    if (!wpc)
+        return 0;
+
+#ifdef ENABLE_LEGACY
+    if (wpc->stream3)
+        return get_version3 (wpc);
+#endif
+
+    return wpc->version_five ? 5 : 4;
+}
+
+// Return the file format specified in the call to WavpackSetFileInformation()
+// when the file was created. For all files created prior to WavPack 5.0 this
+// will be 0 (WP_FORMAT_WAV), which is also what a NULL context yields.
+
+unsigned char WavpackGetFileFormat (WavpackContext *wpc)
+{
+    return wpc ? wpc->file_format : 0;
+}
+
+// Return a string representing the recommended file extension for the open
+// WavPack file. For all files created prior to WavPack 5.0 this will be "wav",
+// even for raw files with no RIFF info. This string is specified in the
+// call to WavpackSetFileInformation() when the file was created.
+
+char *WavpackGetFileExtension (WavpackContext *wpc)
+{
+    if (!wpc || !wpc->file_extension [0])
+        return "wav";
+
+    return wpc->file_extension;
+}
+
+// This function initializes everything required to unpack a WavPack block
+// and must be called before unpack_samples() is called to obtain audio data.
+// It is assumed that the WavpackHeader has been read into the wps->wphdr
+// (in the current WavpackStream) and that the entire block has been read at
+// wps->blockbuff. If a correction file is available (wpc->wvc_flag = TRUE)
+// then the corresponding correction block must be read into wps->block2buff
+// and its WavpackHeader has overwritten the header at wps->wphdr. This is
+// where all the metadata blocks are scanned including those that contain
+// bitstream data.
+
+static int read_metadata_buff (WavpackMetadata *wpmd, unsigned char *blockbuff, unsigned char **buffptr);
+static int process_metadata (WavpackContext *wpc, WavpackMetadata *wpmd);
+static void bs_open_read (Bitstream *bs, void *buffer_start, void *buffer_end);
+
+int unpack_init (WavpackContext *wpc)
+{
+    WavpackStream *wps = wpc->streams [wpc->current_stream];   // the stream being (re)initialized
+    unsigned char *blockptr, *block2ptr;
+    WavpackMetadata wpmd;
+
+    wps->num_terms = 0;                     // reset per-block decoder state before parsing metadata
+    wps->mute_error = FALSE;
+    wps->crc = wps->crc_x = 0xffffffff;
+    wps->dsd.ready = 0;
+    CLEAR (wps->wvbits);
+    CLEAR (wps->wvcbits);
+    CLEAR (wps->wvxbits);
+    CLEAR (wps->decorr_passes);
+    CLEAR (wps->dc);
+    CLEAR (wps->w);
+    // a block without MONO_FLAG is refused when the configured (or reduced) channel count is 1
+    if (!(wps->wphdr.flags & MONO_FLAG) && wpc->config.num_channels && wps->wphdr.block_samples &&
+        (wpc->reduced_channels == 1 || wpc->config.num_channels == 1)) {
+            wps->mute_error = TRUE;
+            return FALSE;
+    }
+    // unknown flag bits, or every MONO_DATA bit set at once, also make the block unusable
+    if ((wps->wphdr.flags & UNKNOWN_FLAGS) || (wps->wphdr.flags & MONO_DATA) == MONO_DATA) {
+        wps->mute_error = TRUE;
+        return FALSE;
+    }
+
+    blockptr = wps->blockbuff + sizeof (WavpackHeader);     // metadata parsing starts right after the header
+
+    while (read_metadata_buff (&wpmd, wps->blockbuff, &blockptr))
+        if (!process_metadata (wpc, &wpmd)) {
+            wps->mute_error = TRUE;
+            return FALSE;
+        }
+    // when a correction block is loaded, its metadata is processed in the same way
+    if (wps->wphdr.block_samples && wpc->wvc_flag && wps->block2buff) {
+        block2ptr = wps->block2buff + sizeof (WavpackHeader);
+
+        while (read_metadata_buff (&wpmd, wps->block2buff, &block2ptr))
+            if (!process_metadata (wpc, &wpmd)) {
+                wps->mute_error = TRUE;
+                return FALSE;
+            }
+    }
+    // a block with samples needs its audio source: dsd.ready for DSD blocks, an open wvbits otherwise
+    if (wps->wphdr.block_samples && ((wps->wphdr.flags & DSD_FLAG) ? !wps->dsd.ready : !bs_is_open (&wps->wvbits))) {
+        if (bs_is_open (&wps->wvcbits))
+            strcpy (wpc->error_message, "can't unpack correction files alone!");
+
+        wps->mute_error = TRUE;
+        return FALSE;
+    }
+    // with no wvx bitstream open, these int32/float conditions flag the file as having lossy blocks
+    if (wps->wphdr.block_samples && !bs_is_open (&wps->wvxbits)) {
+        if ((wps->wphdr.flags & INT32_DATA) && wps->int32_sent_bits)
+            wpc->lossy_blocks = TRUE;
+
+        if ((wps->wphdr.flags & FLOAT_DATA) &&
+            wps->float_flags & (FLOAT_EXCEPTIONS | FLOAT_ZEROS_SENT | FLOAT_SHIFT_SENT | FLOAT_SHIFT_SAME))
+                wpc->lossy_blocks = TRUE;
+    }
+
+    if (wps->wphdr.block_samples)
+        wps->sample_index = GET_BLOCK_INDEX (wps->wphdr);   // the stream position follows the block's index
+
+    return TRUE;
+}
+
+//////////////////////////////// metadata handlers ///////////////////////////////
+
+// These functions handle specific metadata types and are called directly
+// during WavPack block parsing by process_metadata() at the bottom.
+
+// Open the main audio bitstream (which must come from the "wv" file) over this
+// metadata's payload. An empty or odd-length payload is rejected with FALSE.
+
+static int init_wv_bitstream (WavpackStream *wps, WavpackMetadata *wpmd)
+{
+    unsigned char *start = (unsigned char *) wpmd->data;
+    if (wpmd->byte_length == 0 || (wpmd->byte_length & 1) != 0)
+        return FALSE;
+    bs_open_read (&wps->wvbits, start, start + wpmd->byte_length);
+    return TRUE;
+}
+
+// Open the "correction" audio bitstream (currently always from the "wvc" file)
+// over this metadata's payload. An empty or odd-length payload yields FALSE.
+
+static int init_wvc_bitstream (WavpackStream *wps, WavpackMetadata *wpmd)
+{
+    unsigned char *start = (unsigned char *) wpmd->data;
+    if (wpmd->byte_length == 0 || (wpmd->byte_length & 1) != 0)
+        return FALSE;
+    bs_open_read (&wps->wvcbits, start, start + wpmd->byte_length);
+    return TRUE;
+}
+
+// This function initializes the "extra" bitstream for audio samples which
+// contains the information required to losslessly decompress 32-bit float data
+// or integer data that exceeds 24 bits. This bitstream is in the "wv" file
+// for pure lossless data or the "wvc" file for hybrid lossless. This data
+// would not be used for hybrid lossy mode. There is also a 32-bit CRC stored
+// in the first 4 bytes of these blocks.
+
+static int init_wvx_bitstream (WavpackStream *wps, WavpackMetadata *wpmd)
+{
+    unsigned char *payload = (unsigned char *)wpmd->data;
+
+    // need the 4-byte CRC plus some bitstream data, and an even overall length
+    if (wpmd->byte_length <= 4 || (wpmd->byte_length & 1))
+        return FALSE;
+
+    // the CRC is stored least significant byte first, ahead of the bitstream
+    wps->crc_wvx = payload [0] | ((int32_t) payload [1] << 8) |
+        ((int32_t) payload [2] << 16) | ((int32_t) payload [3] << 24);
+
+    bs_open_read (&wps->wvxbits, payload + 4, payload + wpmd->byte_length);
+    return TRUE;
+}
+
+// Read the int32 data from the specified metadata into the specified stream.
+// This data is used for integer data that has more than 24 bits of magnitude
+// or, in some cases, used to eliminate redundant bits from any audio stream.
+
+static int read_int32_info (WavpackStream *wps, WavpackMetadata *wpmd)
+{
+    char *src = (char *)wpmd->data;
+
+    // the block is exactly four bytes, one per field, in the order below
+
+    if (wpmd->byte_length != 4)
+        return FALSE;
+
+    wps->int32_sent_bits = src [0];
+    wps->int32_zeros = src [1];
+    wps->int32_ones = src [2];
+    wps->int32_dups = src [3];
+
+    return TRUE;
+}
+
+// Read the float info from the specified metadata into the specified stream.
+// The block must be exactly four bytes: flags, shift, max_exp and norm_exp.
+
+static int read_float_info (WavpackStream *wps, WavpackMetadata *wpmd)
+{
+    char *src = (char *)wpmd->data;
+
+    if (wpmd->byte_length != 4)
+        return FALSE;
+
+    wps->float_flags = src [0];
+    wps->float_shift = src [1];
+    wps->float_max_exp = src [2];
+    wps->float_norm_exp = src [3];
+
+    return TRUE;
+}
+
+// Read multichannel information from metadata. The first byte is the total
+// number of channels and the following bytes represent the channel_mask
+// as described for Microsoft WAVEFORMATEX.
+
+static int read_channel_info (WavpackContext *wpc, WavpackMetadata *wpmd)
+{
+    unsigned char *src = (unsigned char *)wpmd->data;
+    int length = wpmd->byte_length, bits_set = 0, i;
+    uint32_t mask = 0, scan;
+
+    if (!length || length > 7)
+        return FALSE;
+
+    // once a channel count is known, later blocks are accepted but ignored
+
+    if (wpc->config.num_channels)
+        return TRUE;
+
+    if (length >= 6) {
+
+        // 6 or 7 bytes: new configuration with "unlimited" streams, where
+        // byte 2 holds the upper nibbles of the channel and stream counts
+
+        wpc->config.num_channels = (src [0] | ((src [2] & 0xf) << 8)) + 1;
+        wpc->max_streams = (src [1] | ((src [2] & 0xf0) << 4)) + 1;
+
+        if (wpc->config.num_channels < wpc->max_streams)
+            return FALSE;
+
+        mask = src [3];
+        mask |= (uint32_t) src [4] << 8;
+        mask |= (uint32_t) src [5] << 16;
+
+        if (length == 7)                                // this was introduced in 5.0
+            mask |= (uint32_t) src [6] << 24;
+    }
+    else {
+
+        // 1 to 5 bytes: channel count followed by 0 to 4 little-endian mask bytes
+
+        wpc->config.num_channels = src [0];
+
+        for (i = 1; i < length; ++i)
+            mask |= (uint32_t) src [i] << (8 * (i - 1));
+    }
+
+    if (wpc->config.num_channels > wpc->max_streams * 2)
+        return FALSE;
+
+    wpc->config.channel_mask = mask;
+
+    // the mask may not name more channels than the block declares
+
+    for (scan = mask; scan; scan >>= 1)
+        if (scan & 1) {
+            if (++bits_set > wpc->config.num_channels)
+                return FALSE;
+        }
+
+    return TRUE;
+}
+
+// Read multichannel identity information from metadata. Data is an array of
+// unsigned characters representing any channels in the file that DO NOT
+// match one of the 18 Microsoft standard channels (and are represented in the
+// channel mask). A value of 0 is not allowed and 0xff means an unknown or
+// undefined channel identity.
+
+static int read_channel_identities (WavpackContext *wpc, WavpackMetadata *wpmd)
+{
+    // only the first identities block encountered is kept; later ones are
+    // accepted but ignored
+
+    if (!wpc->channel_identities) {
+        unsigned char *idents = (unsigned char *)malloc (wpmd->byte_length + 1);
+
+        // report allocation failure rather than copying through a NULL pointer
+
+        if (!idents)
+            return FALSE;
+
+        // a zero-length block has no data pointer, so skip the copy in that case
+
+        if (wpmd->byte_length)
+            memcpy (idents, wpmd->data, wpmd->byte_length);
+
+        idents [wpmd->byte_length] = 0;         // zero-terminate the identity list
+        wpc->channel_identities = idents;
+    }
+
+    return TRUE;
+}
+
+// Read configuration information from metadata.
+
+static int read_config_info (WavpackContext *wpc, WavpackMetadata *wpmd)
+{
+    int bytecnt = wpmd->byte_length;
+    unsigned char *byteptr = (unsigned char *)wpmd->data;
+
+    // blocks shorter than 3 bytes carry nothing we use and are simply accepted
+
+    if (bytecnt >= 3) {
+
+        // the low 8 flag bits are kept and the three stored bytes replace bits
+        // 8-31; the casts are unsigned so that a top byte >= 0x80 is never
+        // shifted into the sign bit of a signed integer
+
+        wpc->config.flags &= 0xff;
+        wpc->config.flags |= (uint32_t) *byteptr++ << 8;
+        wpc->config.flags |= (uint32_t) *byteptr++ << 16;
+        wpc->config.flags |= (uint32_t) *byteptr++ << 24;
+        bytecnt -= 3;
+
+        // an optional next byte is the extra mode, present only if flagged
+
+        if (bytecnt && (wpc->config.flags & CONFIG_EXTRA_MODE)) {
+            wpc->config.xmode = *byteptr++;
+            bytecnt--;
+        }
+
+        // we used an extra config byte here for the 5.0.0 alpha, so still
+        // honor it now (but this has been replaced with NEW_CONFIG)
+
+        if (bytecnt) {
+            wpc->config.qmode = (wpc->config.qmode & ~0xff) | *byteptr;
+            wpc->version_five = 1;
+        }
+    }
+
+    return TRUE;
+}
+
+// Read "new" configuration information from metadata.
+
+static int read_new_config_info (WavpackContext *wpc, WavpackMetadata *wpmd)
+{
+    unsigned char *src = (unsigned char *)wpmd->data;
+    int left = wpmd->byte_length;
+    int nchans, stored, i;
+
+    wpc->version_five = 1;      // just having this block signals version 5.0
+
+    // start from a clean slate; every field below is optional in the block
+
+    wpc->channel_layout = 0;
+    wpc->config.qmode = 0;
+    wpc->file_format = 0;
+
+    free (wpc->channel_reordering);
+    wpc->channel_reordering = NULL;
+
+    // if there's any data, the first two bytes are file_format and qmode flags
+
+    if (left < 2)
+        return TRUE;
+
+    wpc->file_format = src [0];
+    wpc->config.qmode = (wpc->config.qmode & ~0xff) | src [1];
+    src += 2;
+    left -= 2;
+
+    // another byte indicates a channel layout
+
+    if (!left)
+        return TRUE;
+
+    wpc->channel_layout = (int32_t) *src++ << 16;
+    left--;
+
+    // without an explicit count, the layout uses the configured channel count
+
+    if (!left) {
+        wpc->channel_layout += wpc->config.num_channels;
+        return TRUE;
+    }
+
+    // another byte means we have a channel count for the layout and maybe a reordering
+
+    nchans = *src++;
+    wpc->channel_layout += nchans;
+    left--;
+
+    // any more means there's a reordering string (never longer than the count)
+
+    if (!left)
+        return TRUE;
+
+    if (left > nchans)
+        return FALSE;
+
+    wpc->channel_reordering = (unsigned char *)malloc (nchans);
+
+    if (!wpc->channel_reordering)
+        return TRUE;
+
+    // note that redundant reordering info is not stored, so we fill in the rest
+
+    stored = left;
+
+    for (i = 0; i < nchans; ++i) {
+        if (i < stored) {
+            unsigned char index = src [i];
+
+            // make sure index is in range
+            wpc->channel_reordering [i] = (index >= nchans) ? 0 : index;
+        }
+        else
+            wpc->channel_reordering [i] = i;
+    }
+
+    return TRUE;
+}
+
+// Read non-standard sampling rate from metadata.
+
+static int read_sample_rate (WavpackContext *wpc, WavpackMetadata *wpmd)
+{
+    unsigned char *src = (unsigned char *)wpmd->data;
+    int length = wpmd->byte_length;
+    int32_t rate;
+
+    // any other length is accepted but leaves the sample rate untouched
+
+    if (length != 3 && length != 4)
+        return TRUE;
+
+    rate = (int32_t) src [0];
+    rate |= (int32_t) src [1] << 8;
+    rate |= (int32_t) src [2] << 16;
+
+    // for sampling rates > 16777215 (non-audio probably, or ...); only the
+    // low 7 bits of the fourth byte are used
+
+    if (length == 4)
+        rate |= (int32_t) (src [3] & 0x7f) << 24;
+
+    wpc->config.sample_rate = rate;
+    return TRUE;
+}
+
+// Read wrapper data from metadata. Currently, this consists of the RIFF
+// header and trailer that wav files contain around the audio data but could
+// be used for other formats as well. Because WavPack files contain all the
+// information required for decoding and playback, this data can probably
+// be ignored except when an exact wavefile restoration is needed.
+
+static int read_wrapper_data (WavpackContext *wpc, WavpackMetadata *wpmd)
+{
+    // wrapper data is only accumulated when the caller asked for it, while we
+    // are still under the size cap, and when this block actually has content
+
+    if ((wpc->open_flags & OPEN_WRAPPER) && wpc->wrapper_bytes < MAX_WRAPPER_BYTES && wpmd->byte_length) {
+
+        // grow through a temporary so that a failed realloc() neither leaks the
+        // existing buffer nor leaves wrapper_data NULL while wrapper_bytes is
+        // still non-zero
+
+        unsigned char *grown = (unsigned char *)realloc (wpc->wrapper_data, wpc->wrapper_bytes + wpmd->byte_length);
+
+        if (!grown)
+            return FALSE;
+
+        wpc->wrapper_data = grown;
+        memcpy (wpc->wrapper_data + wpc->wrapper_bytes, wpmd->data, wpmd->byte_length);
+        wpc->wrapper_bytes += wpmd->byte_length;
+    }
+
+    return TRUE;
+}
+
+// Parse the next metadata sub-block at *buffptr inside the block that starts
+// at blockbuff (whose end is taken from the header's ckSize + 8). On success
+// the id, byte_length and data pointer of wpmd are filled in, *buffptr is
+// advanced past the sub-block (including any pad byte) and TRUE is returned.
+// FALSE is returned if the sub-block would run past the end of the block;
+// note that *buffptr may already have been advanced in that case.
+
+static int read_metadata_buff (WavpackMetadata *wpmd, unsigned char *blockbuff, unsigned char **buffptr)
+{
+    WavpackHeader *wphdr = (WavpackHeader *) blockbuff;
+    unsigned char *buffend = blockbuff + wphdr->ckSize + 8;
+
+    // need at least the id byte and one length byte
+
+    if (buffend - *buffptr < 2)
+        return FALSE;
+
+    // the length byte counts 16-bit words, hence the shift
+
+    wpmd->id = *(*buffptr)++;
+    wpmd->byte_length = *(*buffptr)++ << 1;
+
+    // large sub-blocks carry two more length bytes (bits 9-24 of the byte count)
+
+    if (wpmd->id & ID_LARGE) {
+        wpmd->id &= ~ID_LARGE;
+
+        if (buffend - *buffptr < 2)
+            return FALSE;
+
+        wpmd->byte_length += *(*buffptr)++ << 9;
+        wpmd->byte_length += *(*buffptr)++ << 17;
+    }
+
+    // the odd-size flag means the real length is one less than the stored
+    // (always even) length
+
+    if (wpmd->id & ID_ODD_SIZE) {
+        if (!wpmd->byte_length)         // odd size and zero length makes no sense
+            return FALSE;
+        wpmd->id &= ~ID_ODD_SIZE;
+        wpmd->byte_length--;
+    }
+
+    // data points directly into the block buffer (no copy is made); the
+    // cursor skips the data plus one pad byte when the length is odd
+
+    if (wpmd->byte_length) {
+        if (buffend - *buffptr < wpmd->byte_length + (wpmd->byte_length & 1)) {
+            wpmd->data = NULL;
+            return FALSE;
+        }
+
+        wpmd->data = *buffptr;
+        (*buffptr) += wpmd->byte_length + (wpmd->byte_length & 1);
+    }
+    else
+        wpmd->data = NULL;
+
+    return TRUE;
+}
+
+// Dispatch one parsed metadata sub-block to the handler for its id, using the
+// context's current stream for the per-stream handlers. Returns the handler's
+// TRUE / FALSE result. Ids that are not handled here are accepted only if they
+// have the ID_OPTIONAL_DATA bit set.
+
+static int process_metadata (WavpackContext *wpc, WavpackMetadata *wpmd)
+{
+    WavpackStream *wps = wpc->streams [wpc->current_stream];
+
+    switch (wpmd->id) {
+        case ID_DUMMY:
+            return TRUE;
+
+        case ID_DECORR_TERMS:
+            return read_decorr_terms (wps, wpmd);
+
+        case ID_DECORR_WEIGHTS:
+            return read_decorr_weights (wps, wpmd);
+
+        case ID_DECORR_SAMPLES:
+            return read_decorr_samples (wps, wpmd);
+
+        case ID_ENTROPY_VARS:
+            return read_entropy_vars (wps, wpmd);
+
+        case ID_HYBRID_PROFILE:
+            return read_hybrid_profile (wps, wpmd);
+
+        case ID_SHAPING_WEIGHTS:
+            return read_shaping_info (wps, wpmd);
+
+        case ID_FLOAT_INFO:
+            return read_float_info (wps, wpmd);
+
+        case ID_INT32_INFO:
+            return read_int32_info (wps, wpmd);
+
+        case ID_CHANNEL_INFO:
+            return read_channel_info (wpc, wpmd);
+
+        case ID_CHANNEL_IDENTITIES:
+            return read_channel_identities (wpc, wpmd);
+
+        case ID_CONFIG_BLOCK:
+            return read_config_info (wpc, wpmd);
+
+        case ID_NEW_CONFIG_BLOCK:
+            return read_new_config_info (wpc, wpmd);
+
+        case ID_SAMPLE_RATE:
+            return read_sample_rate (wpc, wpmd);
+
+        case ID_WV_BITSTREAM:
+            return init_wv_bitstream (wps, wpmd);
+
+        case ID_WVC_BITSTREAM:
+            return init_wvc_bitstream (wps, wpmd);
+
+        case ID_WVX_BITSTREAM:
+            return init_wvx_bitstream (wps, wpmd);
+
+        case ID_DSD_BLOCK:
+#ifdef ENABLE_DSD
+            return init_dsd_block (wpc, wpmd);
+#else
+            strcpy (wpc->error_message, "not configured to handle DSD WavPack files!");
+            return FALSE;
+#endif
+
+        // the alternate header / trailer is skipped unless the caller opted in
+        // with OPEN_ALT_TYPES; otherwise it is handled like the RIFF wrapper
+
+        case ID_ALT_HEADER: case ID_ALT_TRAILER:
+            if (!(wpc->open_flags & OPEN_ALT_TYPES))
+                return TRUE;
+
+            // intentional fall through
+
+        case ID_RIFF_HEADER: case ID_RIFF_TRAILER:
+            return read_wrapper_data (wpc, wpmd);
+
+        // likewise, the alternate MD5 is only honored with OPEN_ALT_TYPES
+
+        case ID_ALT_MD5_CHECKSUM:
+            if (!(wpc->open_flags & OPEN_ALT_TYPES))
+                return TRUE;
+
+            // intentional fall through
+
+        case ID_MD5_CHECKSUM:
+            if (wpmd->byte_length == 16) {
+                memcpy (wpc->config.md5_checksum, wpmd->data, 16);
+                wpc->config.flags |= CONFIG_MD5_CHECKSUM;
+                wpc->config.md5_read = 1;
+            }
+
+            return TRUE;
+
+        // the extension is copied only if it fits with room for the terminator
+
+        case ID_ALT_EXTENSION:
+            if (wpmd->byte_length && wpmd->byte_length < sizeof (wpc->file_extension)) {
+                memcpy (wpc->file_extension, wpmd->data, wpmd->byte_length);
+                wpc->file_extension [wpmd->byte_length] = 0;
+            }
+
+            return TRUE;
+
+        // we don't actually verify the checksum here (it's done right after the
+        // block is read), but it's a good indicator of version 5 files
+
+        case ID_BLOCK_CHECKSUM:
+            wpc->version_five = 1;
+            return TRUE;
+
+        // unknown ids are tolerated only when marked as optional data
+
+        default:
+            return (wpmd->id & ID_OPTIONAL_DATA) ? TRUE : FALSE;
+    }
+}
+
+//////////////////////////////// bitstream management ///////////////////////////////
+
+// Open the specified BitStream and associate with the specified buffer.
+
+static void bs_read (Bitstream *bs);
+
+static void bs_open_read (Bitstream *bs, void *buffer_start, void *buffer_end)
+{
+    // the buffer is consumed as 16-bit words
+
+    bs->buf = (uint16_t *)buffer_start;
+    bs->end = (uint16_t *)buffer_end;
+
+    // the read pointer starts one word before the buffer
+
+    bs->ptr = bs->buf - 1;
+
+    // no bits buffered yet and no error seen
+
+    bs->bc = 0;
+    bs->sr = 0;
+    bs->error = 0;
+
+    bs->wrap = bs_read;
+}
+
+// This function is only called from the getbit() and getbits() macros when
+// the BitStream has been exhausted and more data is required. Since these
+// bitstreams no longer access files, this function simply sets an error and
+// resets the buffer.
+
+static void bs_read (Bitstream *bs)
+{
+    // flag the overrun, then rewind to the start of the buffer
+
+    bs->error = 1;
+    bs->ptr = bs->buf;
+}
+
+// This function is called to close the bitstream. It returns the number of
+// full bytes actually read as bits.
+
+uint32_t bs_close_read (Bitstream *bs)
+{
+    uint32_t words, consumed;
+
+    // step past the current word if fewer than a full word of bits is buffered
+
+    if (bs->bc < sizeof (*(bs->ptr)) * 8)
+        bs->ptr++;
+
+    words = (uint32_t)(bs->ptr - bs->buf);
+    consumed = words * sizeof (*(bs->ptr));
+
+    // an even count is bumped to the next odd value
+
+    consumed |= 1;
+
+    CLEAR (*bs);
+    return consumed;
+}
+
+// Normally the trailing wrapper will not be available when a WavPack file is first
+// opened for reading because it is stored in the final block of the file. This
+// function forces a seek to the end of the file to pick up any trailing wrapper
+// stored there (then use WavPackGetWrapper**() to obtain). This can obviously only
+// be used for seekable files (not pipes) and is not available for pre-4.0 WavPack
+// files.
+
+void WavpackSeekTrailingWrapper (WavpackContext *wpc)
+{
+    // nothing to do unless the caller asked for wrapper data
+
+    if (!(wpc->open_flags & OPEN_WRAPPER))
+        return;
+
+    // the trailing wrapper can only be fetched from a seekable source
+
+    if (!wpc->reader->can_seek (wpc->wv_in))
+        return;
+
+    // not available when stream3 is set
+
+    if (wpc->stream3)
+        return;
+
+    seek_eof_information (wpc, NULL, TRUE);
+}
+
+// Get any MD5 checksum stored in the metadata (should be called after reading
+// last sample or an extra seek will occur). A return value of FALSE indicates
+// that no MD5 checksum was stored.
+
+int WavpackGetMD5Sum (WavpackContext *wpc, unsigned char data [16])
+{
+    // the config flags tell us whether a checksum was stored at all
+
+    if (!(wpc->config.flags & CONFIG_MD5_CHECKSUM))
+        return FALSE;
+
+    // if it has not been read yet, look for it at the end of a seekable file
+
+    if (!wpc->config.md5_read && wpc->reader->can_seek (wpc->wv_in))
+        seek_eof_information (wpc, NULL, FALSE);
+
+    if (!wpc->config.md5_read)
+        return FALSE;
+
+    memcpy (data, wpc->config.md5_checksum, 16);
+    return TRUE;
+}
+
+// Read from current file position until a valid 32-byte WavPack 4.0 header is
+// found and read into the specified pointer. The number of bytes skipped is
+// returned. If no WavPack header is found within 1 meg, then a -1 is returned
+// to indicate the error. No additional bytes are read past the header and it
+// is returned in the processor's native endian mode. Seeking is not required.
+
+uint32_t read_next_header (WavpackStreamReader64 *reader, void *id, WavpackHeader *wphdr)
+{
+    // buffer is a header-sized sliding window; sp == ep initially so that the
+    // first pass reads a complete header's worth of bytes
+
+    unsigned char buffer [sizeof (*wphdr)], *sp = buffer + sizeof (*wphdr), *ep = sp;
+    uint32_t bytes_skipped = 0;
+    int bleft;
+
+    while (1) {
+
+        // keep the unscanned tail of the previous window (starting at a
+        // candidate 'w') by sliding it to the front, then top the window up
+
+        if (sp < ep) {
+            bleft = (int)(ep - sp);
+            memmove (buffer, sp, bleft);
+        }
+        else
+            bleft = 0;
+
+        // a short read is reported as failure; note the return type is
+        // unsigned, so callers see -1 as (uint32_t) -1
+
+        if (reader->read_bytes (id, buffer + bleft, sizeof (*wphdr) - bleft) != sizeof (*wphdr) - bleft)
+            return -1;
+
+        sp = buffer;
+
+        // Check the "wvpk" signature, then sanity-check raw header bytes. After
+        // the signature sp points at byte 4, so sp [n] below is window byte
+        // 4 + n. Because the condition short-circuits, sp is left wherever the
+        // first failed test stopped, which is where the rescan below starts.
+        // NOTE(review): bytes 4-7 are presumably the little-endian ckSize (even,
+        // >= 24, < 1 MB), bytes 8-9 the stream version and bytes 22-23 the top
+        // half of block_samples -- confirm against the WavpackHeader layout.
+
+        if (*sp++ == 'w' && *sp == 'v' && *++sp == 'p' && *++sp == 'k' &&
+            !(*++sp & 1) && sp [2] < 16 && !sp [3] && (sp [2] || sp [1] || *sp >= 24) && sp [5] == 4 &&
+            sp [4] >= (MIN_STREAM_VERS & 0xff) && sp [4] <= (MAX_STREAM_VERS & 0xff) && sp [18] < 3 && !sp [19]) {
+                memcpy (wphdr, buffer, sizeof (*wphdr));
+                WavpackLittleEndianToNative (wphdr, WavpackHeaderFormat);
+                return bytes_skipped;
+            }
+
+        // no header here: advance to the next possible start of a signature
+
+        while (sp < ep && *sp != 'w')
+            sp++;
+
+        // give up once more than 1 MB has been skipped
+
+        if ((bytes_skipped += (uint32_t)(sp - buffer)) > 1024 * 1024)
+            return -1;
+    }
+}
+
+// Compare the regular wv file block header to a potential matching wvc
+// file block header and return action code based on analysis:
+//
+//   0 = use wvc block (assuming rest of block is readable)
+//   1 = bad match; try to read next wvc block
+//  -1 = bad match; ignore wvc file for this block and backup fp (if
+//       possible) and try to use this block next time
+
+static int match_wvc_header (WavpackHeader *wv_hdr, WavpackHeader *wvc_hdr)
+{
+    int wv_pos, wvc_pos;
+
+    // different block index or sample count: decide from the sign of the
+    // shifted index difference
+
+    if (GET_BLOCK_INDEX (*wv_hdr) != GET_BLOCK_INDEX (*wvc_hdr) ||
+        wv_hdr->block_samples != wvc_hdr->block_samples)
+            return (((GET_BLOCK_INDEX (*wvc_hdr) - GET_BLOCK_INDEX (*wv_hdr)) << 24) < 0) ? 1 : -1;
+
+    // same index, same sample count and same flags is an exact match
+
+    if (wv_hdr->flags == wvc_hdr->flags)
+        return 0;
+
+    // otherwise rank each block: -1 for an initial block, +1 for a final block
+    // (a block that is both, or neither, ranks 0) and compare the two ranks
+
+    wv_pos = ((wv_hdr->flags & FINAL_BLOCK) ? 1 : 0) - ((wv_hdr->flags & INITIAL_BLOCK) ? 1 : 0);
+    wvc_pos = ((wvc_hdr->flags & FINAL_BLOCK) ? 1 : 0) - ((wvc_hdr->flags & INITIAL_BLOCK) ? 1 : 0);
+
+    return (wvc_pos < wv_pos) ? 1 : -1;
+}
+
+// Read the wvc block that matches the regular wv block that has been
+// read for the current stream. If an exact match is not found then
+// we either keep reading or back up and (possibly) use the block
+// later. The wvc_skip flag is set if no matching wvc block is found
+// so that we can still decode using only the lossy version (although
+// we flag this as an error). A return of FALSE indicates a serious
+// error (not just that we missed one wvc block).
+
+int read_wvc_block (WavpackContext *wpc)
+{
+    WavpackStream *wps = wpc->streams [wpc->current_stream];
+    int64_t bcount, file2pos;
+    WavpackHeader orig_wphdr;
+    WavpackHeader wphdr;
+    int compare_result;
+
+    while (1) {
+
+        // remember where this search started so that the position of an
+        // initial block can be recorded below
+
+        file2pos = wpc->reader->get_pos (wpc->wvc_in);
+        bcount = read_next_header (wpc->reader, wpc->wvc_in, &wphdr);
+
+        // no further header could be found in the wvc file
+
+        if (bcount == (uint32_t) -1) {
+            wps->wvc_skip = TRUE;
+            wpc->crc_errors++;
+            return FALSE;
+        }
+
+        memcpy (&orig_wphdr, &wphdr, 32);       // save original header for verify step
+
+        // rebase the block index the same way as for the wv block: zero when
+        // streaming, otherwise relative to the file's initial index
+
+        if (wpc->open_flags & OPEN_STREAMING)
+            SET_BLOCK_INDEX (wphdr, wps->sample_index = 0);
+        else
+            SET_BLOCK_INDEX (wphdr, GET_BLOCK_INDEX (wphdr) - wpc->initial_index);
+
+        // bcount is the number of bytes skipped before the header was found
+
+        if (wphdr.flags & INITIAL_BLOCK)
+            wpc->file2pos = file2pos + bcount;
+
+        compare_result = match_wvc_header (&wps->wphdr, &wphdr);
+
+        // 0 = this is the matching block, so read the rest of it
+
+        if (!compare_result) {
+            wps->block2buff = (unsigned char *)malloc (wphdr.ckSize + 8);
+	    if (!wps->block2buff)
+	        return FALSE;
+
+            // the 32-byte header has already been consumed, so only the
+            // remaining ckSize - 24 bytes are read, after the header slot
+
+            if (wpc->reader->read_bytes (wpc->wvc_in, wps->block2buff + 32, wphdr.ckSize - 24) !=
+                wphdr.ckSize - 24) {
+                    free (wps->block2buff);
+                    wps->block2buff = NULL;
+                    wps->wvc_skip = TRUE;
+                    wpc->crc_errors++;
+                    return FALSE;
+            }
+
+            // verification is done with the header as it was read from the file
+
+            memcpy (wps->block2buff, &orig_wphdr, 32);
+
+            // don't use corrupt blocks
+            if (!WavpackVerifySingleBlock (wps->block2buff, !(wpc->open_flags & OPEN_NO_CHECKSUM))) {
+                free (wps->block2buff);
+                wps->block2buff = NULL;
+                wps->wvc_skip = TRUE;
+                wpc->crc_errors++;
+                return TRUE;
+            }
+
+            // good block: store the rebased header in the buffer and make it
+            // the stream's current header
+
+            wps->wvc_skip = FALSE;
+            memcpy (wps->block2buff, &wphdr, 32);
+            memcpy (&wps->wphdr, &wphdr, 32);
+            return TRUE;
+        }
+
+        // -1 = skip the wvc data for this block; back up over the header just
+        // read so it can be tried again next time
+
+        else if (compare_result == -1) {
+            wps->wvc_skip = TRUE;
+            wpc->reader->set_pos_rel (wpc->wvc_in, -32, SEEK_CUR);
+            wpc->crc_errors++;
+            return TRUE;
+        }
+
+        // 1 = bad match; loop around and look for the next wvc header
+    }
+}
+
+// This function is used to seek to end of a file to obtain certain information
+// that is stored there at the file creation time because it is not known at
+// the start. This includes the MD5 sum and the trailing part of the file
+// wrapper, and in some rare cases may include the total number of samples in
+// the file (although we usually try to back up and write that at the front of
+// the file). Note this function restores the file position to its original
+// location (and obviously requires a seekable file). The normal return value
+// is TRUE indicating no errors, although this does not actually mean that any
+// information was retrieved. An error return of FALSE usually means the file
+// terminated unexpectedly. Note that this could be used to get all three
+// types of information in one go, but it's not actually used that way now.
+
+static int seek_eof_information (WavpackContext *wpc, int64_t *final_index, int get_wrapper)
+{
+    int64_t restore_pos, last_pos = -1;
+    WavpackStreamReader64 *reader = wpc->reader;
+    int alt_types = wpc->open_flags & OPEN_ALT_TYPES;
+    uint32_t blocks = 0, audio_blocks = 0;
+    void *id = wpc->wv_in;
+    WavpackHeader wphdr;
+
+    restore_pos = reader->get_pos (id);    // we restore file position when done
+
+    // start 1MB from the end-of-file, or from the start if the file is not that big
+
+    if (reader->get_length (id) > (int64_t) 1048576)
+        reader->set_pos_rel (id, -1048576, SEEK_END);
+    else
+        reader->set_pos_abs (id, 0);
+
+    // Note that we go backward (without parsing inside blocks) until we find a block
+    // with audio (careful to not get stuck in a loop). Only then do we go forward
+    // parsing all blocks in their entirety.
+
+    while (1) {
+        uint32_t bcount = read_next_header (reader, id, &wphdr);
+        int64_t current_pos = reader->get_pos (id);
+
+        // if we just got to the same place as last time, we're stuck and need to give up
+
+        if (current_pos == last_pos) {
+            reader->set_pos_abs (id, restore_pos);
+            return FALSE;
+        }
+
+        last_pos = current_pos;
+
+        // We enter here if we just read 1 MB without seeing any WavPack block headers.
+        // Since WavPack blocks are < 1 MB, that means we're in a big APE tag, or we got
+        // to the end-of-file.
+
+        if (bcount == (uint32_t) -1) {
+
+            // if we have not seen any blocks at all yet, back up almost 2 MB (or to the
+            // beginning of the file) and try again
+
+            if (!blocks) {
+                if (current_pos > (int64_t) 2000000)
+                    reader->set_pos_rel (id, -2000000, SEEK_CUR);
+                else
+                    reader->set_pos_abs (id, 0);
+
+                continue;
+            }
+
+            // if we have seen WavPack blocks, then this means we've done all we can do here
+
+            reader->set_pos_abs (id, restore_pos);
+            return TRUE;
+        }
+
+        blocks++;
+
+        // If the block has audio samples, calculate a final index, although this is not
+        // final since this may not be the last block with audio. On the other hand, if
+        // this block does not have audio, and we haven't seen one with audio, we have
+        // to go back some more.
+
+        if (wphdr.block_samples) {
+            if (final_index)
+                *final_index = GET_BLOCK_INDEX (wphdr) + wphdr.block_samples;
+
+            audio_blocks++;
+        }
+        else if (!audio_blocks) {
+            if (current_pos > (int64_t) 1048576)
+                reader->set_pos_rel (id, -1048576, SEEK_CUR);
+            else
+                reader->set_pos_abs (id, 0);
+
+            continue;
+        }
+
+        // at this point we have seen at least one block with audio, so we parse the
+        // entire block looking for MD5 metadata or (conditionally) trailing wrappers
+
+        bcount = wphdr.ckSize - sizeof (WavpackHeader) + 8;
+
+        while (bcount >= 2) {
+            unsigned char meta_id, c1, c2;
+            uint32_t meta_bc, meta_size;
+
+            if (reader->read_bytes (id, &meta_id, 1) != 1 ||
+                reader->read_bytes (id, &c1, 1) != 1) {
+                    reader->set_pos_abs (id, restore_pos);
+                    return FALSE;
+            }
+
+            // the length byte counts 16-bit words
+
+            meta_bc = c1 << 1;
+            bcount -= 2;
+
+            // large sub-blocks carry two more length bytes
+
+            if (meta_id & ID_LARGE) {
+                if (bcount < 2 || reader->read_bytes (id, &c1, 1) != 1 ||
+                    reader->read_bytes (id, &c2, 1) != 1) {
+                        reader->set_pos_abs (id, restore_pos);
+                        return FALSE;
+                }
+
+                meta_bc += ((uint32_t) c1 << 9) + ((uint32_t) c2 << 17);
+                bcount -= 2;
+            }
+
+            // meta_bc is the stored (padded) size, meta_size the real payload size
+
+            meta_size = (meta_id & ID_ODD_SIZE) ? meta_bc - 1 : meta_bc;
+            meta_id &= ID_UNIQUE;
+
+            if (get_wrapper && (meta_id == ID_RIFF_TRAILER || (alt_types && meta_id == ID_ALT_TRAILER)) && meta_bc) {
+
+                // grow through a temporary so that a failed realloc() neither leaks the
+                // existing wrapper nor leaves wrapper_data NULL with wrapper_bytes non-zero
+
+                unsigned char *grown = (unsigned char *)realloc (wpc->wrapper_data, wpc->wrapper_bytes + meta_bc);
+
+                if (!grown) {
+                    reader->set_pos_abs (id, restore_pos);
+                    return FALSE;
+                }
+
+                wpc->wrapper_data = grown;
+
+                if (reader->read_bytes (id, wpc->wrapper_data + wpc->wrapper_bytes, meta_bc) == meta_bc)
+                    wpc->wrapper_bytes += meta_size;
+                else {
+                    reader->set_pos_abs (id, restore_pos);
+                    return FALSE;
+                }
+            }
+            else if (meta_id == ID_MD5_CHECKSUM || (alt_types && meta_id == ID_ALT_MD5_CHECKSUM)) {
+                if (meta_bc == 16 && bcount >= 16) {
+                    if (reader->read_bytes (id, wpc->config.md5_checksum, 16) == 16)
+                        wpc->config.md5_read = TRUE;
+                    else {
+                        reader->set_pos_abs (id, restore_pos);
+                        return FALSE;
+                    }
+                }
+                else
+                    reader->set_pos_rel (id, meta_bc, SEEK_CUR);
+            }
+            else
+                reader->set_pos_rel (id, meta_bc, SEEK_CUR);
+
+            // NOTE(review): meta_bc is not checked against bcount here, so a malformed
+            // sub-block length makes this unsigned subtraction wrap -- confirm whether
+            // that should be treated as an error
+
+            bcount -= meta_bc;
+        }
+    }
+}
+
+// Quickly verify the referenced block. It is assumed that the WavPack header has been converted
+// to native endian format. If a block checksum is performed, that is done in little-endian
+// (file) format. It is also assumed that the caller has made sure that the block length
+// indicated in the header is correct (we won't overflow the buffer). If a checksum is present,
+// then it is checked, otherwise we just check that all the metadata blocks are formatted
+// correctly (without looking at their contents). Returns FALSE for bad block.
+
+int WavpackVerifySingleBlock (unsigned char *buffer, int verify_checksum)
+{
+    WavpackHeader *wphdr = (WavpackHeader *) buffer;
+    uint32_t checksum_passed = 0, bcount, meta_bc;
+    unsigned char *dp, meta_id, c1, c2;
+
+    // the block must carry the "wvpk" id and be at least one header long
+
+    if (strncmp (wphdr->ckID, "wvpk", 4) || wphdr->ckSize + 8 < sizeof (WavpackHeader))
+        return FALSE;
+
+    // bcount is the number of metadata bytes following the header
+
+    bcount = wphdr->ckSize - sizeof (WavpackHeader) + 8;
+    dp = (unsigned char *)(wphdr + 1);
+
+    while (bcount >= 2) {
+        meta_id = *dp++;
+        c1 = *dp++;
+
+        // the length byte counts 16-bit words
+
+        meta_bc = c1 << 1;
+        bcount -= 2;
+
+        // large sub-blocks carry two more length bytes
+
+        if (meta_id & ID_LARGE) {
+            if (bcount < 2)
+                return FALSE;
+
+            c1 = *dp++;
+            c2 = *dp++;
+            meta_bc += ((uint32_t) c1 << 9) + ((uint32_t) c2 << 17);
+            bcount -= 2;
+        }
+
+        // the sub-block may not extend past the end of the block
+
+        if (bcount < meta_bc)
+            return FALSE;
+
+        if (verify_checksum && (meta_id & ID_UNIQUE) == ID_BLOCK_CHECKSUM) {
+#ifdef BITSTREAM_SHORTS
+            uint16_t *csptr = (uint16_t*) buffer;
+#else
+            unsigned char *csptr = buffer;
+#endif
+            // the checksum covers every 16-bit word from the start of the block
+            // up to (not including) this sub-block's two header bytes
+
+            int wcount = (int)(dp - 2 - buffer) >> 1;
+            uint32_t csum = (uint32_t) -1;
+
+            // only 2-byte and 4-byte checksums are valid
+
+            if ((meta_id & ID_ODD_SIZE) || meta_bc < 2 || meta_bc > 4)
+                return FALSE;
+
+#ifdef BITSTREAM_SHORTS
+            while (wcount--)
+                csum = (csum * 3) + *csptr++;
+#else
+            // the checksum is defined over the little-endian (file) form of the
+            // header, so convert it for the calculation and restore it afterward
+
+            WavpackNativeToLittleEndian ((WavpackHeader *) buffer, WavpackHeaderFormat);
+
+            while (wcount--) {
+                csum = (csum * 3) + csptr [0] + (csptr [1] << 8);
+                csptr += 2;
+            }
+
+            WavpackLittleEndianToNative ((WavpackHeader *) buffer, WavpackHeaderFormat);
+#endif
+
+            // compare using indexed reads so dp is left untouched; it is advanced
+            // exactly once, by meta_bc, at the bottom of the loop (advancing it
+            // here as well would misalign any sub-block that follows the checksum)
+
+            if (meta_bc == 4) {
+                if (dp [0] != (csum & 0xff) || dp [1] != ((csum >> 8) & 0xff) ||
+                    dp [2] != ((csum >> 16) & 0xff) || dp [3] != ((csum >> 24) & 0xff))
+                        return FALSE;
+            }
+            else {
+                // the 2-byte form stores the two halves of the checksum folded together
+
+                csum ^= csum >> 16;
+
+                if (dp [0] != (csum & 0xff) || dp [1] != ((csum >> 8) & 0xff))
+                    return FALSE;
+            }
+
+            checksum_passed++;
+        }
+
+        bcount -= meta_bc;
+        dp += meta_bc;
+    }
+
+    // the metadata must exactly fill the block and, when checksums are being
+    // verified and the header says one is present, one must have been verified
+
+    return (bcount == 0) && (!verify_checksum || !(wphdr->flags & HAS_CHECKSUM) || checksum_passed);
+}
diff --git a/third_party/wavpack/src/pack.c b/third_party/wavpack/src/pack.c
index eb4c418..84e884b 100644
--- a/third_party/wavpack/src/pack.c
+++ b/third_party/wavpack/src/pack.c
@@ -1,7 +1,7 @@
 ////////////////////////////////////////////////////////////////////////////
 //                           **** WAVPACK ****                            //
 //                  Hybrid Lossless Wavefile Compressor                   //
-//              Copyright (c) 1998 - 2006 Conifer Software.               //
+//              Copyright (c) 1998 - 2013 Conifer Software.               //
 //               MMX optimizations (c) 2006 Joachim Henke                 //
 //                          All Rights Reserved.                          //
 //      Distributed under the BSD Software License (see license.txt)      //
@@ -10,1074 +10,16 @@
 // pack.c
 
 // This module actually handles the compression of the audio data, except for
-// the entropy coding which is handled by the words? modules. For efficiency,
-// the conversion is isolated to tight loops that handle an entire buffer.
-
-#include "wavpack_local.h"
+// the entropy encoding which is handled by the write_words.c module. For better
+// efficiency, the conversion is isolated to tight loops that handle an entire
+// buffer.
 
 #include <stdlib.h>
-#include <stdio.h>
 #include <string.h>
 #include <math.h>
 
-#ifdef DEBUG_ALLOC
-#define malloc malloc_db
-#define realloc realloc_db
-#define free free_db
-void *malloc_db (uint32_t size);
-void *realloc_db (void *ptr, uint32_t size);
-void free_db (void *ptr);
-int32_t dump_alloc (void);
-#endif
-
-//////////////////////////////// local tables ///////////////////////////////
-
-// These two tables specify the characteristics of the decorrelation filters.
-// Each term represents one layer of the sequential filter, where positive
-// values indicate the relative sample involved from the same channel (1=prev),
-// 17 & 18 are special functions using the previous 2 samples, and negative
-// values indicate cross channel decorrelation (in stereo only).
-
-static const WavpackDecorrSpec fast_specs [] = {
-        { 1, 2,18,17 },  // 0
-        { 1, 1,17,17 },  // 1
-        { 0, 2,18,17 },  // 2
-        { 0, 1,17,17 },  // 3
-        { 1, 3, 1,18 },  // 4
-        { 1, 1,17, 1 },  // 5
-        { 0, 1, 1,17 },  // 6
-        { 0, 1,-2,17 },  // 7
-        { 0, 2,-1,17 },  // 8
-        { 1, 1,17, 2 },  // 9
-        { 0, 3,18,18 },  // 10
-        { 0, 1,17, 1 },  // 11
-        { 1, 6, 1, 2 },  // 12
-        { 1, 1,17, 3 },  // 13
-        { 0, 1,-2, 3 },  // 14
-        { 0, 1, 2,17 },  // 15
-        { 0, 1,18,-2 },  // 16
-        { 0, 1,-1,17 },  // 17
-        { 0, 1,18,17 },  // 18
-        { 0, 1,17, 2 },  // 19
-        { 1, 2,18,-2 },  // 20
-        { 1, 1, 1,17 },  // 21
-        { 0, 3,18, 2 },  // 22
-        { 0, 1,17,-2 },  // 23
-        { 0, 1,18,-2 },  // 24
-        { 1, 2,17,-3 },  // 25
-        { 0, 1,18, 3 },  // 26
-        { 0, 1,18,18 },  // 27
-        { 1, 1, 1, 3 },  // 28
-        { 1, 1,18, 3 },  // 29
-        { 1, 1, 1, 3 },  // 30
-        { 0, 2,18,17 },  // 31
-        { 1, 1, 1,17 },  // 32
-        { 1, 1,17, 3 },  // 33
-        { 0, 3,18,17 },  // 34
-        { 0, 1,18,18 },  // 35
-        { 1, 1, 1, 3 },  // 36
-        { 1, 1, 1,18 },  // 37
-        { 0, 1,18,-2 },  // 38
-        { 0, 2,18,17 },  // 39
-        { 0, 1,-1,18 },  // 40
-        { 1, 1,17, 3 },  // 41
-        { 0, 1,17, 2 },  // 42
-        { 0, 1,17, 3 },  // 43
-        { 1, 1,18, 2 },  // 44
-        { 1, 1,17,-2 },  // 45
-        { 0, 1, 1,-2 },  // 46
-        { 0, 2,18,17 },  // 47
-        { 0, 1,17,-2 },  // 48
-        { 1, 1,17,-2 },  // 49
-        { 0, 1,18, 3 },  // 50
-        { 0, 1, 2,17 },  // 51
-        { 1, 2,18,-3 },  // 52
-        { 1, 2, 1,18 },  // 53
-        { 1, 2,18, 2 },  // 54
-        { 0, 1,17,-1 },  // 55
-        { 0, 1,17,-2 },  // 56
-        { 1, 1,17,-2 },  // 57
-        { 1, 1, 1, 3 },  // 58
-        { 0, 1, 1,17 },  // 59
-        { 1, 2,18,-2 },  // 60
-        { 1, 2,17,-3 },  // 61
-        { 0, 2,18,17 },  // 62
-        { 0, 2,18,17 },  // 63
-        { 1, 1,17, 2 },  // 64
-        { 1, 2,18,18 },  // 65
-        { 0, 1,17, 2 },  // 66
-        { 0, 1,18,17 },  // 67
-        { 1, 1, 1,17 },  // 68
-        { 1, 1,17, 2 },  // 69
-        { 0, 2,18,18 },  // 70
-        { 0, 2,18,17 },  // 71
-        { 1, 2,17,-3 },  // 72
-        { 1, 6, 1, 2 },  // 73
-        { 0, 3,17,17 },  // 74
-        { 0, 1, 1,18 },  // 75
-        { 0, 1, 1,-2 },  // 76
-        { 1, 1,17, 2 },  // 77
-        { 0, 2,18,17 },  // 78
-        { 0, 2,18,17 },  // 79
-        { 1, 1,18, 3 },  // 80
-        { 1, 2,17,-3 },  // 81
-        { 0, 1,17, 2 },  // 82
-        { 0, 1,17, 3 },  // 83
-        { 0, 1,18,-2 },  // 84
-        { 1, 1,18,18 },  // 85
-        { 1, 6, 1, 2 },  // 86
-        { 0, 2,18,17 },  // 87
-        { 0, 2,18,17 },  // 88
-        { 0, 1,-1,17 },  // 89
-        { 1, 1,18, 3 },  // 90
-        { 0, 1,17,18 },  // 91
-        { 1, 1,17, 3 },  // 92
-        { 0, 1,18, 3 },  // 93
-        { 0, 2,18,17 },  // 94
-        { 0, 2,18,17 },  // 95
-        { 1, 2,18, 2 },  // 96
-        { 0, 1,-2, 3 },  // 97
-        { 0, 4,18,-1 },  // 98
-        { 0, 2,18,18 },  // 99
-        { 0, 1,-2, 3 },  // 100
-        { 1, 1,17,-2 },  // 101
-        { 0, 1,17, 3 },  // 102
-        { 0, 2,18,17 },  // 103
-        { 0, 2,-1,18 },  // 104
-        { 1, 1, 2,17 },  // 105
-        { 0, 2,17,-2 },  // 106
-        { 0, 1,17, 2 },  // 107
-        { 1, 2,18,-3 },  // 108
-        { 0, 1,17,-2 },  // 109
-        { 0, 2,18,17 },  // 110
-        { 0, 2,18,17 },  // 111
-        { 1, 1,17,-2 },  // 112
-        { 1, 2,17,-3 },  // 113
-        { 1, 1, 1, 3 },  // 114
-        { 1, 1, 2,17 },  // 115
-        { 1, 2,18, 2 },  // 116
-        { 1, 1, 2,17 },  // 117
-        { 1, 1,18, 2 },  // 118
-        { 0, 2,18,17 },  // 119
-        { 0, 2,18,17 },  // 120
-        { 0, 1,17,-2 },  // 121
-        { 0, 2,18,17 },  // 122
-        { 0, 2,17,-1 },  // 123
-        { 0, 2,18,-2 },  // 124
-        { 0, 2,18,17 },  // 125
-        { 0, 2,18,17 },  // 126
-        { 0, 2,18,17 },  // 127
-        { 1, 1, 1, 3 },  // 128
-        { 0, 2,-2,17 },  // 129
-        { 0, 2,18,-2 },  // 130
-        { 0, 2,17,-2 },  // 131
-        { 1, 1, 2,17 },  // 132
-        { 1, 1, 1, 3 },  // 133
-        { 0, 1, 2,17 },  // 134
-        { 0, 2,18,17 },  // 135
-        { 0, 3,-1,17 },  // 136
-        { 1, 1, 2,17 },  // 137
-        { 0, 2,18,18 },  // 138
-        { 0, 1,17, 2 },  // 139
-        { 1, 4,18,-3 },  // 140
-        { 1, 1,18, 1 },  // 141
-        { 0, 2,18,17 },  // 142
-        { 0, 2,18,17 },  // 143
-        { 1, 2,18,-1 },  // 144
-        { 0, 1,-1,18 },  // 145
-        { 1, 6, 1, 2 },  // 146
-        { 1, 1,17, 2 },  // 147
-        { 1, 4,18, 3 },  // 148
-        { 0, 1, 1,17 },  // 149
-        { 0, 1,18, 2 },  // 150
-        { 0, 2,18,17 },  // 151
-        { 0, 2,18,17 },  // 152
-        { 1, 2,17, 2 },  // 153
-        { 0, 2,18,-2 },  // 154
-        { 0, 1, 1,18 },  // 155
-        { 1, 2,18,-3 },  // 156
-        { 0, 2,18,17 },  // 157
-        { 0, 2,18,17 },  // 158
-        { 0, 2,18,17 },  // 159
-        { 1, 2,18,18 },  // 160
-        { 1, 3,17,17 },  // 161
-        { 0, 1,-2,17 },  // 162
-        { 0, 1,17,18 },  // 163
-        { 0, 1,-1, 3 },  // 164
-        { 1, 1, 2,17 },  // 165
-        { 0, 2,18,-1 },  // 166
-        { 0, 2,18,17 },  // 167
-        { 0, 2,18,17 },  // 168
-        { 1, 1,17,-2 },  // 169
-        { 1, 2,17, 2 },  // 170
-        { 1, 1,18, 3 },  // 171
-        { 0, 1,18, 2 },  // 172
-        { 1, 2,17,-3 },  // 173
-        { 0, 2,18,17 },  // 174
-        { 0, 2,18,17 },  // 175
-        { 0, 1,-2,17 },  // 176
-        { 0, 1,17,-1 },  // 177
-        { 0, 1,18,-1 },  // 178
-        { 0, 2,18,17 },  // 179
-        { 1, 2,17,-3 },  // 180
-        { 1, 1, 1,18 },  // 181
-        { 1, 3,18, 2 },  // 182
-        { 0, 2,18,17 },  // 183
-        { 0, 2,18,17 },  // 184
-        { 0, 2,18,17 },  // 185
-        { 0, 2,18,17 },  // 186
-        { 0, 3,18,18 },  // 187
-        { 0, 1, 1,-2 },  // 188
-        { 0, 2,18,17 },  // 189
-        { 0, 2,18,17 },  // 190
-        { 0, 2,18,17 },  // 191
-        { 1, 2,17,-3 },  // 192
-        { 1, 1,18,18 },  // 193
-        { 0, 2,18, 2 },  // 194
-        { 0, 1,17,18 },  // 195
-        { 1, 2,18, 2 },  // 196
-        { 1, 1,17,-2 },  // 197
-        { 0, 2,17,-1 },  // 198
-        { 0, 2,18,17 },  // 199
-        { 0, 2,18,17 },  // 200
-        { 0, 2,18,17 },  // 201
-        { 0, 1, 1,-2 },  // 202
-        { 0, 1,18, 1 },  // 203
-        { 1, 2,18,-2 },  // 204
-        { 0, 1,17, 2 },  // 205
-        { 0, 2,18,17 },  // 206
-        { 0, 2,18,17 },  // 207
-        { 1, 1,17, 3 },  // 208
-        { 0, 1,17,-1 },  // 209
-        { 0, 1,18, 2 },  // 210
-        { 1, 1,17, 3 },  // 211
-        { 1, 1,17,-2 },  // 212
-        { 0, 1,18,18 },  // 213
-        { 0, 2,18,17 },  // 214
-        { 0, 2,18,17 },  // 215
-        { 0, 2,18,17 },  // 216
-        { 0, 2,18,17 },  // 217
-        { 0, 2,18,17 },  // 218
-        { 1, 1,17,18 },  // 219
-        { 0, 1,-2, 3 },  // 220
-        { 0, 2,18,17 },  // 221
-        { 0, 2,18,17 },  // 222
-        { 0, 2,18,17 },  // 223
-        { 1, 2,18,-3 },  // 224
-        { 0, 2,18,17 },  // 225
-        { 0, 3,18, 2 },  // 226
-        { 0, 1, 1,18 },  // 227
-        { 0, 2,18,17 },  // 228
-        { 0, 1,17,-1 },  // 229
-        { 0, 2,18,17 },  // 230
-        { 0, 2,18,17 },  // 231
-        { 0, 2,18,17 },  // 232
-        { 0, 1,-2, 3 },  // 233
-        { 0, 3,17,17 },  // 234
-        { 0, 2,18,17 },  // 235
-        { 0, 2,18,17 },  // 236
-        { 1, 1,17, 2 },  // 237
-        { 0, 2,18,17 },  // 238
-        { 0, 2,18,17 },  // 239
-        { 1, 1,17, 2 },  // 240
-        { 0, 2,18,17 },  // 241
-        { 0, 2,18,17 },  // 242
-        { 0, 2,18,17 },  // 243
-        { 0, 2,18, 2 },  // 244
-        { 0, 2,18,17 },  // 245
-        { 0, 2,18,17 },  // 246
-        { 0, 2,18,17 },  // 247
-        { 0, 2,18,17 },  // 248
-        { 0, 2,18,17 },  // 249
-        { 0, 2,18,17 },  // 250
-        { 0, 2,18,17 },  // 251
-        { 0, 2,18,17 },  // 252
-        { 0, 2,18,17 },  // 253
-        { 0, 2,18,17 },  // 254
-        { 0, 2,18,17 },  // 255
-};
-
-static const WavpackDecorrSpec default_specs [] = {
-        { 1, 2,18,18, 2,17, 3 },         // 0
-        { 0, 2,18,17,-1, 3, 2 },         // 1
-        { 1, 1,17,18,18,-2, 2 },         // 2
-        { 0, 2,18,17, 3,-2,17 },         // 3
-        { 1, 2,18,17, 2,17, 3 },         // 4
-        { 0, 1,18,18,-1, 2,17 },         // 5
-        { 0, 1,17,17,-2, 2, 3 },         // 6
-        { 0, 1,18,-2,18, 2,17 },         // 7
-        { 1, 2,18,18,-1, 2, 3 },         // 8
-        { 0, 2,18,17, 3, 2, 5 },         // 9
-        { 1, 1,18,17,18, 2, 5 },         // 10
-        { 0, 1,17,17,-2, 2, 3 },         // 11
-        { 0, 1,18,-2,18, 2, 5 },         // 12
-        { 0, 1,17,-2,17, 2,-3 },         // 13
-        { 1, 1,17,-2,17, 1, 2 },         // 14
-        { 0, 1,17,17,-2, 2, 3 },         // 15
-        { 1, 1,18, 3, 1, 5, 4 },         // 16
-        { 1, 4,18,18, 2, 3,-2 },         // 17
-        { 0, 1, 1,-1,-1, 2,17 },         // 18
-        { 0, 2,18,17, 3, 2, 5 },         // 19
-        { 0, 1,18,18,18, 2,17 },         // 20
-        { 0, 1,18,17,-1, 2,18 },         // 21
-        { 1, 1,17, 3, 2, 1, 7 },         // 22
-        { 0, 2,18,-2,18, 2, 3 },         // 23
-        { 1, 3,18,-3,18, 2, 3 },         // 24
-        { 0, 3,18,17, 2, 3,17 },         // 25
-        { 1, 1,17,17, 2, 1, 4 },         // 26
-        { 0, 1,17,18,-2, 2,17 },         // 27
-        { 1, 1,18,18, 3, 5, 2 },         // 28
-        { 0, 1,17,17, 2,18, 4 },         // 29
-        { 0, 1,18,17, 1, 4, 6 },         // 30
-        { 1, 1, 3,17,18, 2,17 },         // 31
-        { 1, 1,17, 3, 2, 1, 7 },         // 32
-        { 0, 1,18,17,-1, 2, 3 },         // 33
-        { 1, 1,17,17, 2, 1, 4 },         // 34
-        { 1, 2,18,17,-1,17, 3 },         // 35
-        { 1, 2,18,17, 2, 3,-1 },         // 36
-        { 0, 2,18,18,-2, 2,17 },         // 37
-        { 0, 1,17,17, 2,18, 4 },         // 38
-        { 0, 5,-2,18,18,18, 2 },         // 39
-        { 1, 1,18,18,-1, 6, 3 },         // 40
-        { 0, 1,17,17,-2, 2, 3 },         // 41
-        { 1, 1,18,17,18, 2,17 },         // 42
-        { 0, 1,18,17, 4, 3, 1 },         // 43
-        { 0, 1,-2,18, 2, 2,18 },         // 44
-        { 1, 2,18,18,-2, 2,-1 },         // 45
-        { 1, 1,17,17, 2, 1, 4 },         // 46
-        { 0, 1,17,18,-2, 2,17 },         // 47
-        { 1, 1,17, 3, 2, 1, 7 },         // 48
-        { 1, 3,18,-3,18, 2, 3 },         // 49
-        { 1, 2,18,18,-2, 2,-1 },         // 50
-        { 1, 1,18,18, 3, 5, 2 },         // 51
-        { 0, 2,18,18,-1, 2,17 },         // 52
-        { 0, 1,18,-1,17,18, 2 },         // 53
-        { 0, 1,17,-1, 2, 3, 6 },         // 54
-        { 0, 1,18,-2,18, 2, 5 },         // 55
-        { 1, 2,18,18,-2, 2,-1 },         // 56
-        { 0, 3,18,18, 2, 3,17 },         // 57
-        { 0, 1,17,17, 2,18, 4 },         // 58
-        { 1, 1,17,-2,17, 1, 2 },         // 59
-        { 0, 1,-1, 3, 5, 4, 7 },         // 60
-        { 0, 3,18,18, 3, 2, 5 },         // 61
-        { 0, 1,17,17, 2,18, 4 },         // 62
-        { 0, 1,18,17,-2,18, 3 },         // 63
-        { 0, 2,18,18,-2, 2,17 },         // 64
-        { 0, 3,18,17,-2, 2, 3 },         // 65
-        { 1, 1,18,18,-2, 2,17 },         // 66
-        { 0, 1,18,17, 4, 3, 1 },         // 67
-        { 1, 2, 3,18,17, 2,17 },         // 68
-        { 1, 2,18,18, 2,-2,18 },         // 69
-        { 1, 2,18,18,-1,18, 2 },         // 70
-        { 0, 2,18,18,-2, 2,17 },         // 71
-        { 1, 3,18,18, 2, 3,-2 },         // 72
-        { 0, 3,18,18, 3, 2, 5 },         // 73
-        { 0, 1,18,-2,18, 2, 5 },         // 74
-        { 1, 1,17, 3, 2, 1, 7 },         // 75
-        { 1, 3,18,18,-2, 2,18 },         // 76
-        { 1, 1,17,18,18,-2, 2 },         // 77
-        { 0, 1,18,-2,18, 2, 5 },         // 78
-        { 0, 2,18,-2,18, 2, 3 },         // 79
-        { 0, 1,-1, 3, 4, 5, 7 },         // 80
-        { 1, 1,17,17, 2,-1, 7 },         // 81
-        { 0, 1,18,-1,-1, 2,-2 },         // 82
-        { 0, 2,18,17, 2, 3,17 },         // 83
-        { 0, 1,18,17, 2,18, 2 },         // 84
-        { 0, 2,18,17,-1, 2,17 },         // 85
-        { 0, 1, 1,18, 3, 2, 5 },         // 86
-        { 0, 2,18,-2, 4,18, 2 },         // 87
-        { 1, 1,18, 3, 1, 5, 4 },         // 88
-        { 0, 1,18,17,18, 2, 5 },         // 89
-        { 1, 1,18, 3, 1, 5, 4 },         // 90
-        { 0, 4,18,18,-2, 2,18 },         // 91
-        { 1, 1,18,18, 3, 2, 5 },         // 92
-        { 1, 1,17,17, 2, 1, 4 },         // 93
-        { 0, 2,18,18,-2,18, 2 },         // 94
-        { 0, 2,18,18,-2,18, 2 },         // 95
-        { 1, 1,18,18, 2, 1, 3 },         // 96
-        { 1, 1,17,17, 2, 1, 4 },         // 97
-        { 1, 2,17,17, 2,18, 3 },         // 98
-        { 0, 1,18,17, 1, 4, 6 },         // 99
-        { 1, 2,18,18,-2, 2,-1 },         // 100
-        { 0, 1,18,-2,18, 2, 5 },         // 101
-        { 1, 1,17, 2,18, 2,17 },         // 102
-        { 0, 2,18,18,-2,18, 2 },         // 103
-        { 0, 1,18,18, 3, 6,-1 },         // 104
-        { 0, 1,18,17, 2,18, 3 },         // 105
-        { 0, 1,18,17,-2, 2,17 },         // 106
-        { 1, 1, 3,17,18, 2,17 },         // 107
-        { 1, 3,18,-3,18, 2, 3 },         // 108
-        { 1, 3,18,18,-3,18, 2 },         // 109
-        { 1, 1,18, 3, 1, 5, 4 },         // 110
-        { 0, 1,17,-2,17, 2,-3 },         // 111
-        { 1, 1,18,18, 3, 5, 2 },         // 112
-        { 1, 2,18,18,-2, 2,-1 },         // 113
-        { 0, 1,18,-1,-1, 2,-2 },         // 114
-        { 1, 1,18, 3, 1, 5, 4 },         // 115
-        { 0, 3,18,17,-1, 2,17 },         // 116
-        { 1, 3,18,17, 2,18,-2 },         // 117
-        { 0, 2,18,18,-2,18, 2 },         // 118
-        { 1, 2,18,18,-2, 2,-1 },         // 119
-        { 1, 1,18, 3, 1, 5, 4 },         // 120
-        { 0, 4, 3,18,18, 2,17 },         // 121
-        { 0, 2,18,18,-2,18, 2 },         // 122
-        { 1, 1,18,17,-1,18, 2 },         // 123
-        { 0, 2,18,18,-2,18, 2 },         // 124
-        { 0, 2,18,18,-2,18, 2 },         // 125
-        { 0, 2,18,18,-2,18, 2 },         // 126
-        { 0, 2,18,18,-2,18, 2 },         // 127
-        { 1, 1,18,18,18, 3, 2 },         // 128
-        { 0, 1,17,-1, 2, 3, 6 },         // 129
-        { 0, 1,17,-1, 2, 3, 6 },         // 130
-        { 0, 2,18,17,-2, 3, 2 },         // 131
-        { 1, 3,18,17, 2,-2,18 },         // 132
-        { 0, 2,18,18, 2,17, 3 },         // 133
-        { 0, 1,18,18, 2,18,-2 },         // 134
-        { 0, 2,18,-2, 4,18, 2 },         // 135
-        { 0, 1,-2,18, 2, 2,18 },         // 136
-        { 0, 2,18,17, 3, 6, 2 },         // 137
-        { 0, 1,18,17,18, 2, 5 },         // 138
-        { 0, 3,18,18,-2, 3, 2 },         // 139
-        { 1, 1,18,18, 2,18, 5 },         // 140
-        { 0, 1,17,-1, 2, 3, 6 },         // 141
-        { 1, 4,18,18, 2, 3,-2 },         // 142
-        { 0, 2,18,17,18, 2,-2 },         // 143
-        { 0, 1, 1,18, 3, 2, 5 },         // 144
-        { 1, 4,18,-2,18, 2, 3 },         // 145
-        { 1, 2,18, 2,18, 3,-2 },         // 146
-        { 0, 2,18,18,18, 2, 4 },         // 147
-        { 0, 2, 3,17,18, 2,17 },         // 148
-        { 1, 1,18,-1,18, 2,17 },         // 149
-        { 1, 2,17,17, 2,18, 3 },         // 150
-        { 0, 2,18,17,-2, 3, 2 },         // 151
-        { 0, 1, 1,-1,-1, 2,17 },         // 152
-        { 0, 3, 3,18,18, 2,17 },         // 153
-        { 0, 1,18,-1,17,18, 2 },         // 154
-        { 0, 1,18,17, 2,18, 3 },         // 155
-        { 0, 2,18,18,-2,18, 2 },         // 156
-        { 0, 1,18,17, 2,18, 2 },         // 157
-        { 0, 2,18,18,-2,18, 2 },         // 158
-        { 0, 2,18,18,-2,18, 2 },         // 159
-        { 1, 2,17,17, 2,18, 3 },         // 160
-        { 0, 1,18,17,-2, 2, 3 },         // 161
-        { 0, 1,18,-2,18, 2, 5 },         // 162
-        { 1, 4,18,-2,18, 2, 3 },         // 163
-        { 1, 3,18,17, 2, 3, 6 },         // 164
-        { 0, 2,18,18, 2,17, 3 },         // 165
-        { 0, 2,18,17, 2,18, 2 },         // 166
-        { 0, 2,18,18,-2,18, 2 },         // 167
-        { 1, 1,18,18, 3, 5, 2 },         // 168
-        { 0, 2,18,18,-2, 2, 3 },         // 169
-        { 1, 2,18,17, 2,17, 3 },         // 170
-        { 0, 1,18,17, 2, 3,18 },         // 171
-        { 0, 2,18,18,-2,18, 2 },         // 172
-        { 1, 4,18,18, 2, 3,-2 },         // 173
-        { 0, 1,17,-2,17, 2,-3 },         // 174
-        { 0, 1,17,17, 2,18, 4 },         // 175
-        { 1, 1,18,18,18, 2, 4 },         // 176
-        { 1, 2,18, 2,18, 3,-2 },         // 177
-        { 1, 1,18,18,-2, 2,17 },         // 178
-        { 0, 2,18,18,-2,18, 2 },         // 179
-        { 0, 2,18,18, 2,17, 3 },         // 180
-        { 0, 2,18,18,18, 2, 4 },         // 181
-        { 0, 2,18,18,-2,18, 2 },         // 182
-        { 0, 2,18,17,-2, 3, 2 },         // 183
-        { 0, 1, 1,-1,-1, 2,17 },         // 184
-        { 1, 4,18,18, 2, 3,-2 },         // 185
-        { 0, 2,18,18,-2,18, 2 },         // 186
-        { 0, 1,18,-2,18, 3, 2 },         // 187
-        { 0, 2,18,18,-2,18, 2 },         // 188
-        { 0, 2,18,18,-2,18, 2 },         // 189
-        { 0, 2,18,18,-2,18, 2 },         // 190
-        { 0, 2,18,18,-2,18, 2 },         // 191
-        { 0, 1,18,18,-2, 2,17 },         // 192
-        { 0, 3,18,17, 2, 3,17 },         // 193
-        { 1, 2,18,18, 2,-2,18 },         // 194
-        { 0, 1,-1, 3, 5, 4, 7 },         // 195
-        { 1, 1,18, 3, 1, 5, 4 },         // 196
-        { 1, 1,18,18,-2,18, 3 },         // 197
-        { 0, 2,18,17,18, 2,-2 },         // 198
-        { 0, 2,18,18, 2,17, 3 },         // 199
-        { 1, 2,18, 2,18, 3,-2 },         // 200
-        { 1, 4,18,18, 2, 3,-2 },         // 201
-        { 1, 3,18,17, 2, 3, 6 },         // 202
-        { 0, 2,18,18,-2,18, 2 },         // 203
-        { 1, 2,18,17,-2,-1,17 },         // 204
-        { 0, 1,17,-1, 2, 3, 6 },         // 205
-        { 0, 2,18,18,-2,18, 2 },         // 206
-        { 0, 2,18,18,-2, 2, 3 },         // 207
-        { 1, 1,18,18,18, 2, 5 },         // 208
-        { 0, 1,17,17,-2, 2, 3 },         // 209
-        { 0, 2,18,18,-2,18, 2 },         // 210
-        { 0, 2,18,17, 3, 6, 2 },         // 211
-        { 0, 2,18,17,18, 2, 3 },         // 212
-        { 0, 3,18,17,-3,18, 2 },         // 213
-        { 0, 1,18,18,18, 2, 3 },         // 214
-        { 0, 1,18,-2,-3, 2, 6 },         // 215
-        { 0, 2,18,18,-2,18, 2 },         // 216
-        { 1, 1,18,17,18, 2, 5 },         // 217
-        { 0, 2,18,18,-2,18, 2 },         // 218
-        { 0, 2,18,18,-2,18, 2 },         // 219
-        { 1, 1,18,17,18, 2, 5 },         // 220
-        { 0, 2,18,18,-2,18, 2 },         // 221
-        { 0, 2,18,18,-2,18, 2 },         // 222
-        { 0, 2,18,18,-2,18, 2 },         // 223
-        { 0, 1,18,18,18, 2, 3 },         // 224
-        { 1, 1,17,-2,17, 1, 2 },         // 225
-        { 1, 1,17,17, 2,-1, 7 },         // 226
-        { 0, 1,18,17, 4, 3, 1 },         // 227
-        { 1, 3,18,-3,18, 2, 3 },         // 228
-        { 0, 1, 1,18, 3, 2, 5 },         // 229
-        { 0, 2,18,18,-2,18, 2 },         // 230
-        { 0, 2,18,18,-2,18, 2 },         // 231
-        { 0, 1,18,18, 3, 6, 2 },         // 232
-        { 0, 1,17,17, 2,18, 4 },         // 233
-        { 0, 1,17,17, 2,18, 4 },         // 234
-        { 0, 2,18,18,-2,18, 2 },         // 235
-        { 0, 2,18,18,-2,18, 2 },         // 236
-        { 0, 2,18,18,-2,18, 2 },         // 237
-        { 1, 2,18,-2,18, 3, 2 },         // 238
-        { 1, 1,17,-2,17, 1, 2 },         // 239
-        { 1, 1,18,18, 3, 2, 5 },         // 240
-        { 0, 1,18,18,-1, 2, 3 },         // 241
-        { 0, 2,18,18,-2,18, 2 },         // 242
-        { 0, 2,18,18,-2,18, 2 },         // 243
-        { 0, 1,18,17,18, 2, 5 },         // 244
-        { 0, 2,18,18,-2,18, 2 },         // 245
-        { 0, 2,18,18,-2,18, 2 },         // 246
-        { 0, 2,18,18,-2,18, 2 },         // 247
-        { 0, 2,18,18,-2,18, 2 },         // 248
-        { 0, 1, 3,18,18, 2,17 },         // 249
-        { 0, 2,18,18,-2,18, 2 },         // 250
-        { 0, 2,18,18,-2,18, 2 },         // 251
-        { 0, 2,18,18,-2,18, 2 },         // 252
-        { 0, 2,18,18,-2,18, 2 },         // 253
-        { 0, 2,18,18,-2,18, 2 },         // 254
-        { 0, 2,18,18,-2,18, 2 },         // 255
-};
-
-static const WavpackDecorrSpec high_specs [] = {
-        { 1, 2,18,18,18,-2, 2, 3, 5,-1,17, 4 },  // 0
-        { 0, 1,18,17,-2, 2,18, 3, 7, 2, 5, 4 },  // 1
-        { 1, 2, 1,18, 3, 6,-2,18, 2, 3, 4, 5 },  // 2
-        { 0, 2,18,18,-2, 2,18, 3, 6, 2,17, 4 },  // 3
-        { 1, 2,18,18, 2,18, 3, 2,-1, 4,18, 5 },  // 4
-        { 1, 1, 7, 6, 5, 3, 4, 2, 5, 4, 3, 7 },  // 5
-        { 1, 1,17, 3,18, 7, 2, 6, 1, 4, 3, 5 },  // 6
-        { 1, 1,-2,18,18,18, 3,-2, 6, 5, 2, 1 },  // 7
-        { 1, 2,18,18,-1,18, 2, 3, 6,-2,17, 5 },  // 8
-        { 0, 1,17,17,18, 3, 6, 4, 5, 2,18,-2 },  // 9
-        { 1, 2, 1,18,-2, 3, 5, 2, 4,-1, 6, 1 },  // 10
-        { 0, 2,18,18, 3, 6,18, 2, 4, 8, 5, 3 },  // 11
-        { 0, 1,-2, 1,18, 2,-2, 7,18, 2,-1, 5 },  // 12
-        { 1, 1, 4, 3, 8, 1, 5, 2, 5, 6, 2, 8 },  // 13
-        { 1, 1,17,18, 2, 6, 3, 4,-1, 1, 8, 6 },  // 14
-        { 0, 1,18,18, 3, 6, 3,-2, 2, 5,-1, 1 },  // 15
-        { 0, 1,18,18,17,-1, 2,-2,18, 3, 4, 5 },  // 16
-        { 1, 2,18,17, 2,-2,18, 3, 5, 7, 2, 4 },  // 17
-        { 1, 2,18,18, 3, 6,-2,18, 2, 5, 8, 3 },  // 18
-        { 0, 1,18,17, 2,18,18, 2, 6, 5,17, 7 },  // 19
-        { 1, 2,18,17, 2,18, 3, 2, 6,18,-1, 4 },  // 20
-        { 1, 1, 5, 3, 6, 5, 3, 4, 1, 2, 4, 7 },  // 21
-        { 1, 1, 5, 3, 6, 5, 3, 4, 1, 2, 4, 7 },  // 22
-        { 0, 1,-2,18,18,18,-2, 3, 2, 4, 6, 5 },  // 23
-        { 1, 2,18,17,-3, 3,-1,18, 2, 3, 6, 5 },  // 24
-        { 0, 1,17,18, 7, 3,-2, 7, 1, 2, 4, 5 },  // 25
-        { 1, 1, 2,18,18,-2, 2, 4,-1,18, 3, 6 },  // 26
-        { 0, 3, 1,18, 4, 3, 5, 2, 4,18, 2, 3 },  // 27
-        { 0, 1,-2,18, 2,18, 3, 7,18, 2, 6,-2 },  // 28
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 29
-        { 1, 1,18,18, 5, 4, 6, 4, 5, 1, 4, 3 },  // 30
-        { 1, 1,18, 3, 6, 5, 7, 8, 2, 3, 1,-1 },  // 31
-        { 1, 1,18,18,18, 2,-2, 3, 5,18, 2, 8 },  // 32
-        { 0, 2,18,17,-2, 2, 3,18,-3, 5, 2, 7 },  // 33
-        { 1, 1, 1, 1,-1, 8,17, 3,-2, 2, 6,17 },  // 34
-        { 0, 2,18,18,17, 2,-2, 3, 2, 4,18, 5 },  // 35
-        { 1, 1,17,18, 2,-1, 5, 7,18, 3, 4, 6 },  // 36
-        { 1, 1, 5, 4, 5,17, 3, 6, 3, 4, 7, 2 },  // 37
-        { 0, 1,17, 3, 1, 7, 4, 2, 5,-2,18, 6 },  // 38
-        { 0, 1,17,18, 2,18, 4, 3, 5, 7,-3, 6 },  // 39
-        { 1, 2,17,17,-3,-2, 2, 8,18,-1, 3, 5 },  // 40
-        { 0, 1,17,17,18, 2, 3, 6,-2, 8, 1, 7 },  // 41
-        { 1, 1, 1, 2, 6,-2,18, 2, 5,-3, 7,-2 },  // 42
-        { 0, 1,18,18, 3,18, 6, 8,-2, 2, 3, 5 },  // 43
-        { 0, 1,18,17, 2,18,-2, 3, 7, 6, 2, 4 },  // 44
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 45
-        { 1, 1,18,18, 2,-1, 3, 6, 1, 3, 4, 8 },  // 46
-        { 0, 1,18,18, 3, 6, 5, 3,-2, 2,18,-1 },  // 47
-        { 0, 1,18,17,-3,18, 2, 4,-2, 3, 6,17 },  // 48
-        { 1, 3, 1, 2,17, 3,18, 7,-1, 5, 2, 4 },  // 49
-        { 1, 1,18, 3,18, 6, 8,18,-2, 5, 7, 2 },  // 50
-        { 0, 1,17, 2,18, 6, 3, 2, 5, 4, 8, 1 },  // 51
-        { 0, 1,18,17,-1, 2, 3,18,18, 2, 3,17 },  // 52
-        { 1, 1,18, 7, 6, 5, 5, 3, 1, 4, 2, 4 },  // 53
-        { 1, 1, 6,17, 3, 8, 1, 5, 7,-1, 2, 1 },  // 54
-        { 1, 1,18,-2,18, 3,-2, 2, 7, 4, 6,18 },  // 55
-        { 1, 3,18,-3,18, 2, 3,18,-1, 7, 2, 5 },  // 56
-        { 0, 2,18,-2, 7, 1, 3, 2, 4, 6,-3, 7 },  // 57
-        { 1, 1,18,-2, 2,-3,18,-2,17,-1, 4, 2 },  // 58
-        { 0, 3,17,17, 2, 5, 3, 7,18, 6, 4, 2 },  // 59
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 60
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 61
-        { 1, 1,18,17, 4, 6, 6, 4, 5, 3, 4, 1 },  // 62
-        { 0, 1,18, 5, 3, 6, 2, 3, 8, 1, 3, 7 },  // 63
-        { 1, 2,18,17,-2, 2,18, 3, 5, 7,-1, 2 },  // 64
-        { 0, 1, 1,18,18, 3, 6,-1, 4, 8, 5, 2 },  // 65
-        { 1, 1, 1, 5, 3, 4, 1, 1, 3, 5, 7, 3 },  // 66
-        { 0, 1, 3,18,18, 2,18,18,-1, 2, 3,18 },  // 67
-        { 1, 2,18,18,-1,18, 2, 3, 4, 6,18, 5 },  // 68
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 69
-        { 1, 1,18, 3, 1, 4, 5, 2, 7, 1, 3, 6 },  // 70
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 71
-        { 1, 2,18,18,-1,18, 2, 3, 5,-2, 6, 8 },  // 72
-        { 1, 1,17,18, 4, 8, 3, 2, 5, 2, 7, 6 },  // 73
-        { 1, 4, 1, 2, 5,18,-2, 2, 3, 7,-1, 4 },  // 74
-        { 0, 2,18,17,-1, 3, 6,18, 2, 3, 7, 5 },  // 75
-        { 0, 1,-2,18, 2,-3, 6,18, 4, 3,-2, 5 },  // 76
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 77
-        { 0, 1,17,17, 6, 2, 4, 8, 3, 5,-1,17 },  // 78
-        { 1, 1,18, 3,18, 6, 8,18,-2, 5, 7, 2 },  // 79
-        { 1, 2,17,17,-3, 2,18,-2, 8, 3, 6,-1 },  // 80
-        { 1, 1,18,-2,17,18, 2, 3,-2, 6, 5, 4 },  // 81
-        { 1, 2,18,17,-1, 3,18, 2, 5, 3, 6,-3 },  // 82
-        { 0, 1,18,17, 2,18, 7,18, 2, 4, 3,17 },  // 83
-        { 1, 3,18,18, 5, 6, 4, 3, 4,18, 6, 5 },  // 84
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 85
-        { 1, 1, 7, 6, 5, 3, 4, 2, 5, 4, 3, 7 },  // 86
-        { 0, 1,-2,18,18,18, 3, 6, 4, 2, 5, 2 },  // 87
-        { 0, 3,18,17,-3,18, 3, 2, 5,-1,17, 3 },  // 88
-        { 1, 1,17,18, 7, 3, 1, 7, 4, 2, 6, 5 },  // 89
-        { 1, 1,18, 2,-2,-1,18, 5, 3,-2, 1, 2 },  // 90
-        { 0, 3,18,18,-1, 3, 2, 7, 5,18, 4, 3 },  // 91
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 92
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 93
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 94
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 95
-        { 1, 1,17,18, 2,-2, 4, 8,18, 3, 6, 5 },  // 96
-        { 0, 2,18,17, 3, 5,-2, 7, 2,18, 3,-1 },  // 97
-        { 1, 1,18, 2,-2,-1,18, 5, 3,-2, 1, 2 },  // 98
-        { 0, 2, 3,17,18,18, 2, 5, 7, 6,18, 3 },  // 99
-        { 1, 1,17,18,18, 4, 3, 2,18, 7, 8,-1 },  // 100
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 101
-        { 0, 1,17, 1, 2, 3, 5, 6, 1, 4, 8,17 },  // 102
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 103
-        { 0, 2,18,17,-1,18,-3, 2, 8, 3, 6,17 },  // 104
-        { 1, 1,17,17, 1, 2, 4, 5,-1, 2, 1, 6 },  // 105
-        { 1, 1, 1, 2, 6,-2,18, 2,-3, 3,-2, 5 },  // 106
-        { 0, 1,18, 3,18, 6,18, 5, 2, 4,-1, 8 },  // 107
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 108
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 109
-        { 1, 1,18,18,-1, 2,18, 3, 6, 4,-2, 7 },  // 110
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 111
-        { 0, 2,-1,18,18,18, 2,-2, 4, 7, 2, 3 },  // 112
-        { 0, 3, 3,17,-2, 5, 2, 7,18, 6, 4, 5 },  // 113
-        { 0, 1,17, 6,18, 3, 8, 4, 5, 3, 8,18 },  // 114
-        { 0, 2,18, 2, 6, 2,18, 3, 2, 4, 5, 8 },  // 115
-        { 0, 1, 3,18,18, 2,18,-1, 2,18, 2,17 },  // 116
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 117
-        { 0, 1, 3, 6,17,-2, 5, 1, 2, 7, 4, 8 },  // 118
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 119
-        { 1, 3, 3,18,17, 5, 6, 2, 7,-2, 8,18 },  // 120
-        { 1, 1,18,-1, 3, 1, 7, 2,-1, 4, 6,17 },  // 121
-        { 1, 1,18, 2,-2,-1,18, 5, 3,-2, 1, 2 },  // 122
-        { 0, 2,18, 1, 2,18, 3, 6, 5, 2, 4, 8 },  // 123
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 124
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 125
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 126
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 127
-        { 1, 1,17,-2, 2,18,18, 8, 5, 3, 2, 6 },  // 128
-        { 0, 1,18,17, 2,18, 3, 2, 7,-2,18, 4 },  // 129
-        { 1, 2, 1,18, 2, 3,-1, 5, 6, 4, 7,17 },  // 130
-        { 0, 2,18,17, 3, 6,-2, 2, 3, 8, 5,17 },  // 131
-        { 0, 2,18,18, 3, 2,18,-1, 2, 4, 3,17 },  // 132
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 133
-        { 1, 2,17,-1,18, 2, 3,-2, 5,18, 2, 7 },  // 134
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 135
-        { 1, 2,18,-3,18, 2, 3,-2,18, 5, 6,-3 },  // 136
-        { 0, 2,18,17, 3, 5,-2, 7, 2,18, 3,-1 },  // 137
-        { 1, 1, 1,18,-1, 2, 3, 1,-2, 8, 2, 5 },  // 138
-        { 0, 1,18,18, 3, 6,18, 2, 3, 4, 8, 5 },  // 139
-        { 0, 1,-2, 1,18, 2,-2, 5, 7,18, 2,-1 },  // 140
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 141
-        { 1, 1,17,18,-1, 2, 8, 3, 4, 5, 1, 7 },  // 142
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 143
-        { 0, 2,18,18,-1, 2,18, 3,-2, 5, 4, 2 },  // 144
-        { 1, 1,18,17, 2,18, 3, 8, 5, 2, 7,17 },  // 145
-        { 0, 1,18,18, 3,18, 6, 8,-2, 2, 3, 5 },  // 146
-        { 0, 1,18,18, 2,18, 2, 6,18, 2,17, 7 },  // 147
-        { 1, 3,18,17,18, 2, 8,18, 5,-1, 3, 6 },  // 148
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 149
-        { 1, 1,18, 7, 6, 5, 5, 3, 1, 4, 2, 4 },  // 150
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 151
-        { 1, 2,18,17,-1, 3, 6,18, 2, 5, 8, 3 },  // 152
-        { 0, 1,17,18,18, 4, 7, 2, 3,-2,18, 5 },  // 153
-        { 1, 2,18, 1, 2, 6, 2, 5,18, 2, 4, 8 },  // 154
-        { 0, 4,18, 4, 1, 2, 3, 5, 4, 1, 2, 6 },  // 155
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 156
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 157
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 158
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 159
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 160
-        { 0, 2,18,17, 2,-1,18, 3,-3, 5, 2, 4 },  // 161
-        { 0, 1,17,17, 3, 6, 3, 5,-2, 2,18,-1 },  // 162
-        { 0, 2,18,18, 3,-2,18, 2,-3, 5, 3, 6 },  // 163
-        { 1, 1,17,17, 2, 4, 1, 3, 5, 2, 6,-3 },  // 164
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 165
-        { 0, 1,17, 1, 3, 2, 7, 1, 6, 3, 4, 8 },  // 166
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 167
-        { 0, 1,17,-1,18, 2, 1, 5, 3, 8,-1,-2 },  // 168
-        { 1, 1,17,18,-1, 8, 2, 5, 3, 4, 1, 6 },  // 169
-        { 1, 2, 1,18, 3,-1, 5, 1, 2, 4, 7, 6 },  // 170
-        { 0, 1,18,18, 3, 6, 5, 3,-2, 2,18,-1 },  // 171
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 172
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 173
-        { 0, 1, 1,18,-1, 3, 8, 5, 6, 1, 2, 3 },  // 174
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 175
-        { 0, 2,18,18, 2, 3, 6,18,-1, 4, 2, 3 },  // 176
-        { 1, 1, 1, 3, 5,18, 2, 6, 7, 2, 3, 1 },  // 177
-        { 1, 1, 1, 3, 8,18, 5, 2, 7, 1, 3,-2 },  // 178
-        { 0, 2,17, 2,18, 3, 6, 2, 4, 5, 8, 3 },  // 179
-        { 0, 1,18,17, 2,18, 3, 2, 7,-2,18, 4 },  // 180
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 181
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 182
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 183
-        { 1, 2,18,-3,18,-1, 3,-2, 5, 7, 1, 2 },  // 184
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 185
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 186
-        { 0, 3,18,18, 2, 6,18, 5,18, 2, 3,17 },  // 187
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 188
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 189
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 190
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 191
-        { 1, 3, 1,-1, 1, 3,-2, 2, 5, 7,-3,18 },  // 192
-        { 1, 2,18, 7, 3,-3, 2, 8, 2, 5, 4,17 },  // 193
-        { 1, 1, 1, 4, 5, 1, 3, 4, 6, 7, 8, 3 },  // 194
-        { 0, 1,18,17, 2,18,-1, 2, 3,18, 2, 4 },  // 195
-        { 0, 2,18,18,-2,18, 2, 3, 4, 7, 5,17 },  // 196
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 197
-        { 1, 1,17,18, 2, 1, 3, 2, 5, 1, 2, 3 },  // 198
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 199
-        { 0, 2,18,18,-1, 2, 3, 5, 8, 6, 1,-2 },  // 200
-        { 0, 1,17,18, 8, 3, 4, 6, 5, 2, 8, 7 },  // 201
-        { 1, 2, 1, 3,-2,18, 2, 5, 1, 7,-1,-2 },  // 202
-        { 0, 3,18,17,-1, 3,18, 2, 3, 6, 4,17 },  // 203
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 204
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 205
-        { 1, 2,18,18, 4,18, 6, 7, 8, 3,18, 2 },  // 206
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 207
-        { 0, 2,17,-3,17, 2,-2, 8, 3,18, 4,-3 },  // 208
-        { 1, 1,18,17, 3, 5, 6, 2, 8, 1, 3, 7 },  // 209
-        { 0, 1,18,18, 3, 6, 5, 3,-2, 2,18,-1 },  // 210
-        { 0, 3,18,18, 2, 6,18, 5,18, 2, 3,17 },  // 211
-        { 1, 1,18,18, 5, 4, 6, 4, 5, 1, 4, 3 },  // 212
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 213
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 214
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 215
-        { 0, 2, 3,17,18,-3, 2, 5,18, 6,-1, 7 },  // 216
-        { 1, 1,17,18, 3, 2, 5,-1, 6, 8, 4, 7 },  // 217
-        { 1, 1,18, 1,-2, 3, 2, 1, 7, 6, 3, 4 },  // 218
-        { 0, 3, 1, 2,17, 3,18, 2, 7, 5, 4,-1 },  // 219
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 220
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 221
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 222
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 223
-        { 1, 1,17,-2, 2,18,18, 8, 5, 3, 2, 6 },  // 224
-        { 0, 2,18, 5,18, 2, 3, 7,-2, 1, 6, 8 },  // 225
-        { 0, 1, 2,-1,18,-1, 2, 4,-3, 5,18, 3 },  // 226
-        { 0, 1, 3,17,18, 5, 2,18, 7, 3, 6, 5 },  // 227
-        { 1, 4, 1, 2, 5,18,-2, 2, 3, 7,-1, 4 },  // 228
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 229
-        { 0, 1, 1,18, 2, 1, 3, 4, 1, 5, 2, 7 },  // 230
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 231
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 232
-        { 0, 1,17,17,18, 2, 4, 5,18,-2, 6, 3 },  // 233
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 234
-        { 0, 2,18,18,-1, 3, 5, 6, 8,18, 2, 3 },  // 235
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 236
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 237
-        { 0, 1,18,18, 4, 6, 8,18, 7, 3, 2, 5 },  // 238
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 239
-        { 0, 2,-1,18,18,18, 2, 4,-2, 2, 3, 6 },  // 240
-        { 0, 2,18,-2, 7, 1, 3, 2, 4, 6,-3, 7 },  // 241
-        { 1, 1,17,18, 8, 3, 4, 6,-2, 5, 3, 8 },  // 242
-        { 0, 2,18, 1, 2, 6, 2, 8, 3,18, 5, 4 },  // 243
-        { 1, 1, 3,18,18, 2,18, 2,18, 3, 2,18 },  // 244
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 245
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 246
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 247
-        { 1, 1, 3,17,18, 5, 2, 6, 7, 1, 4, 8 },  // 248
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 249
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 250
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 251
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 252
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 253
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 254
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2, 8 },  // 255
-};
-
-static const WavpackDecorrSpec very_high_specs [] = {
-        { 1, 2,18,18, 2, 3,-2,18, 2, 4, 7, 5, 3, 6, 8,-1,18, 2 },        // 0
-        { 0, 1,18,18,-1,18, 2, 3, 4, 6, 5, 7,18,-3, 8, 2,-1, 3 },        // 1
-        { 1, 2, 1,18,-2, 4,18, 2, 3, 6,-1, 7, 5,-2,18, 8, 2, 4 },        // 2
-        { 0, 1,17,17, 2, 3, 4,18,-1, 5, 6, 7,18, 2, 8,17, 3,-2 },        // 3
-        { 1, 1,18,18, 2,18, 3, 2,18, 4,-1, 3,18, 2, 6, 8,17, 5 },        // 4
-        { 0, 2,18,17, 2, 3,-2, 5,18,-3, 2, 4, 7, 3, 6, 8, 5,17 },        // 5
-        { 1, 1,18,-2, 2,-3,18, 5,-2,18, 2, 3, 6, 2,17, 4, 7,-1 },        // 6
-        { 1, 1,17, 8,18, 3,-2, 2, 5, 4,18, 6, 3, 8, 7, 2, 5, 4 },        // 7
-        { 0, 2,18,17,-2, 2,18, 3, 2, 5,-3, 4, 7,18, 3, 8, 6, 2 },        // 8
-        { 1, 1, 3, 6, 5, 5, 1, 3, 7, 4, 2, 6, 4,18, 3, 7, 5, 6 },        // 9
-        { 1, 2, 1,18, 3, 2,-2, 1, 5, 4, 6, 2, 7, 1, 8, 3,-1, 1 },        // 10
-        { 0, 1,18,18, 2, 3, 6, 3, 5,-2, 2, 4,18, 3,-2,-1, 6, 7 },        // 11
-        { 0, 1,-2,18, 2,18, 7, 2, 6,-2, 3, 4,18,18, 2,-3, 8, 5 },        // 12
-        { 0, 2,18,18,18, 2, 4, 3,18, 5, 3, 6,-2, 2, 4,18, 8, 7 },        // 13
-        { 0, 1,-2, 1,18, 2,-2,18,-1, 5, 7, 2, 3, 4,18, 2, 6, 2 },        // 14
-        { 1, 1,17,18, 3, 2, 1, 7,-1, 2, 4, 3, 5, 6,-2,18, 7, 8 },        // 15
-        { 1, 1,18,18, 2,18, 3, 4, 6,-2,18, 5, 8, 2, 3, 7, 4,-1 },        // 16
-        { 0, 1,18,18,18,-1, 2, 3, 4, 6, 8,18, 3, 5, 2, 6, 7, 4 },        // 17
-        { 1, 1,17,-2,18,18, 2, 5, 3, 8, 2,-1, 6, 1, 3, 4, 7, 5 },        // 18
-        { 0, 1,17,17,18, 2, 3, 6,-2, 8, 1, 7, 5, 2, 3, 1, 4, 8 },        // 19
-        { 1, 1,17,17, 3, 2, 7, 1, 4, 3, 6, 2, 5,-2, 8, 7,18, 6 },        // 20
-        { 0, 1,18,17,-2, 2,18, 3,-3, 7, 6, 5, 2, 4,-1, 8, 3,17 },        // 21
-        { 1, 1, 2,18,18,-2, 2, 4,-1, 5,18, 3, 8, 6, 2, 7,17, 4 },        // 22
-        { 0, 1,17, 3, 6, 8, 5, 4, 3, 8, 1,18, 7, 2, 4, 5, 6, 3 },        // 23
-        { 1, 2,17,18, 4, 8, 3, 2, 5, 7, 6, 8, 2, 7,-2,18, 3, 4 },        // 24
-        { 1, 1, 6, 5, 5, 3, 4, 7, 3, 2, 4, 6, 3, 7, 1, 5, 2, 4 },        // 25
-        { 1, 1, 1,18,-1, 2, 1, 3, 8,-2, 2, 5, 6, 3, 8, 7,18, 4 },        // 26
-        { 0, 1, 1,17,-1,18, 3, 2, 5, 4, 6, 7, 8, 3, 4, 2, 1,-2 },        // 27
-        { 0, 1,18, 2,18,18, 2,18, 6,-2,18, 7, 5, 4, 3, 2,18,-2 },        // 28
-        { 0, 3, 1, 4,18, 3, 2, 4, 1, 5, 2, 3, 6,18, 8, 7, 2, 4 },        // 29
-        { 0, 1,17,-2, 1,-3, 2,18, 3,-2, 4,18, 3, 6, 7,-3, 2, 8 },        // 30
-        { 1, 1,17,18,18, 4, 2, 3, 7, 6,18, 8, 5,-1, 4, 2, 3,17 },        // 31
-        { 1, 2,18,-1,17,18, 2, 3,-2,18, 5, 8, 2, 4, 3, 7, 6,-1 },        // 32
-        { 1, 1,18,18,18,-2, 4, 2, 3,18, 5, 8, 2, 4, 6, 7,-2, 3 },        // 33
-        { 1, 2,18,18,-2,18,-1, 3, 2, 5,18,-2, 7, 2, 3, 4, 6, 8 },        // 34
-        { 0, 1,17,18,-1, 2, 4,18, 8, 3, 6, 5, 7,-3, 2, 4, 3,17 },        // 35
-        { 1, 1,18,18,17, 2,-1,18, 3, 2,18, 6, 5, 4,18, 7, 2,-1 },        // 36
-        { 0, 2, 1,18,-1,18, 3, 2, 4, 6,-3, 7,-1, 5, 1, 2, 3, 8 },        // 37
-        { 1, 1, 1,17,-2, 2,-3, 6, 3, 5, 1, 2, 7, 6, 8,-2, 4, 1 },        // 38
-        { 0, 1,17,-1, 5, 1, 4, 3, 6, 2,-2,18, 3, 2, 4, 5, 8,-1 },        // 39
-        { 0, 2,18,18,17, 2, 3,-2, 5,18, 2, 4, 7, 8, 6,17, 3, 5 },        // 40
-        { 1, 1, 1, 5, 1, 3, 4, 3, 7, 5, 1, 3, 6, 1, 2, 4, 3, 8 },        // 41
-        { 1, 2, 1,-1, 3, 2,18, 7,-2, 5, 2, 6, 4, 3,-1,18, 8, 7 },        // 42
-        { 0, 2,18,17, 3,18, 2, 5, 4, 3, 6, 2, 7, 8,18, 3, 4, 5 },        // 43
-        { 1, 1, 3, 6,17, 8, 7, 5,18,-1, 1, 2, 3, 4, 2, 6, 8, 1 },        // 44
-        { 0, 2,18,18, 3,-3,18, 2, 6, 5, 3, 7,18, 4,-2, 8, 2, 3 },        // 45
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 46
-        { 1, 1,17, 1, 7, 2, 3,18,-2, 3, 6, 4, 2, 7, 8, 5, 3,17 },        // 47
-        { 1, 1, 3, 6, 5, 5, 1, 3, 7, 4, 2, 6, 4,18, 3, 7, 5, 6 },        // 48
-        { 0, 1,18,18,18, 2, 4,-1,18, 8,-1, 2, 3, 4, 6,-2, 1, 7 },        // 49
-        { 1, 1,18,-2,17,18, 2, 6, 3,-2, 5, 4, 7, 1,-3, 8, 2, 6 },        // 50
-        { 0, 1,17,18,18, 4, 2, 7, 3, 6,-2,18, 8, 4, 5, 2, 7,17 },        // 51
-        { 1, 1,18,18, 5, 4, 6, 4, 1, 5, 4, 3, 2, 5, 6, 1, 4, 5 },        // 52
-        { 0, 1,18,18,-2,18, 2,-3, 3, 8, 5,18, 6, 4, 3,-1, 7, 2 },        // 53
-        { 1, 1,18, 2,-2,-3,18, 5, 2, 3,-2, 4, 6, 1,-3, 2, 7, 8 },        // 54
-        { 0, 1,18, 3, 5, 8, 2, 6, 7, 3, 1, 5, 2,-1, 8, 6, 7, 4 },        // 55
-        { 1, 1, 4, 3, 8, 1, 5, 6, 2, 5, 8,-2, 2, 7, 3,18, 5, 4 },        // 56
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 57
-        { 1, 1,17, 3,18,18, 7, 2, 4,18, 6, 2, 3,-1, 8, 5,18,-3 },        // 58
-        { 0, 1, 3,17,18, 2,18, 6, 7,-3,18, 2, 5, 6, 3, 8, 7,-1 },        // 59
-        { 1, 1,18,18, 2,18,18, 2,-1, 7, 3,18, 5, 2, 6, 4,-1,18 },        // 60
-        { 0, 3,18, 3, 4, 1, 5, 2,18, 4, 2, 3,18, 7, 6, 1, 2, 4 },        // 61
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 62
-        { 1, 1,17, 1,18, 2, 3, 6, 4, 5, 7,18, 3, 8, 2, 4,-2,17 },        // 63
-        { 1, 2,18,17, 2, 3, 5,18, 6,-2, 7, 3, 2, 4,18, 8,-1, 5 },        // 64
-        { 0, 2, 1,18,-1,18, 3, 2, 4, 6,-3, 7,-1, 5, 1, 2, 3, 8 },        // 65
-        { 1, 1, 1,18,-1, 8, 2, 6, 3,-2, 1, 2, 5, 4,-3, 8, 6, 3 },        // 66
-        { 0, 1,18,18, 2,18, 2,18, 7, 6,18, 2,-2, 3, 5, 4,18, 8 },        // 67
-        { 1, 2,18,17, 2, 3,18,-1, 2, 3, 6,18, 5, 4, 3, 7, 2, 8 },        // 68
-        { 1, 2,18,18, 3,-2, 4,18, 5, 7, 6, 2, 4,-3, 8, 5,18, 3 },        // 69
-        { 1, 1,17,-2,18,18, 2, 5, 3, 8, 2,-1, 6, 1, 3, 4, 7, 5 },        // 70
-        { 1, 1, 3,17,18, 5, 7, 2, 4, 6, 1, 8,-1, 3, 7, 4, 1, 2 },        // 71
-        { 0, 2, 1,-2, 2,18, 3, 5, 2, 4, 7,-1, 2, 3, 5,18,-2, 4 },        // 72
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 73
-        { 1, 1, 1, 2,-2, 6,18,-3, 2, 7, 3,-2, 5, 6, 1, 8, 2, 4 },        // 74
-        { 0, 1,18,18,18, 3,-2, 6,18, 2, 4, 3, 5, 8, 7, 6, 2,-2 },        // 75
-        { 1, 1, 1, 5, 1, 3, 4, 3, 7, 5, 1, 3, 6, 1, 2, 4, 3, 8 },        // 76
-        { 0, 1, 3,17,18, 2, 5,18, 6, 7, 5,-2, 2, 4,18, 3, 6, 8 },        // 77
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 78
-        { 0, 2,17,-1,18, 2, 4,-1, 8, 3,18, 7,-3, 4, 5, 1, 2,-2 },        // 79
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 8, 6, 4, 5, 7,-1 },        // 80
-        { 1, 1,18,18, 3, 6, 4, 8,-2, 2, 5, 3, 7,18, 6, 8, 4, 2 },        // 81
-        { 1, 1,17,18,18,-2, 5, 2, 3, 1, 4,-1, 8, 6, 5, 3, 2,18 },        // 82
-        { 1, 1,17,17, 1, 2, 4, 5, 2, 6,-1, 3, 1, 1,-2, 4, 2, 7 },        // 83
-        { 1, 1,17, 1, 7, 2, 3,18,-2, 3, 6, 4, 2, 7, 8, 5, 3,17 },        // 84
-        { 0, 1,18,17,-2,-3, 1, 2, 3, 2, 5, 4, 7,-3, 6,-2, 2, 1 },        // 85
-        { 1, 1, 1, 3, 5,18, 1, 2, 7, 3, 6, 2, 5, 8,-1, 1, 4, 7 },        // 86
-        { 1, 1,17, 3, 6, 8, 1, 4, 5, 3,-2, 7, 2, 8, 5, 6,18, 3 },        // 87
-        { 1, 1,17,18, 2, 4, 8,-2, 3, 1, 5, 6, 7, 1, 2, 3, 4, 7 },        // 88
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 89
-        { 1, 1, 3, 1, 8,18, 5, 2, 3,18, 6, 7,-2, 4, 3, 2, 8,18 },        // 90
-        { 0, 1,18,17, 2,18, 3, 4,-1,18, 7, 6, 2, 8, 4,18,18, 5 },        // 91
-        { 0, 1,18,18, 2,18,18, 2, 7,-2, 6, 5, 4, 3,18, 3, 2,17 },        // 92
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 93
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 94
-        { 1, 1,17, 8,18, 3, 2, 1, 5, 4, 6,-1, 3,-3, 8,18, 7, 2 },        // 95
-        { 1, 2,18,17,18, 2, 3, 5,-2,18, 6,-1, 2, 3, 7, 4, 8,17 },        // 96
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 8, 6, 4, 5, 7,-1 },        // 97
-        { 1, 2,18,18,-2,17, 2,18, 3, 4,18, 8, 7,-1, 2, 4, 5,17 },        // 98
-        { 0, 2,17,-3,17, 3, 2,-2,18, 8, 4,-3, 2,18, 5, 3,-2, 6 },        // 99
-        { 0, 1,18,18, 2,18,18, 2, 7,-2, 6, 5, 4, 3,18, 3, 2,17 },        // 100
-        { 0, 2, 1,18,-1, 3, 5, 2,-3,18, 7, 3,-1, 6, 4, 2,17, 5 },        // 101
-        { 1, 1,17,-2,17, 2,-3, 1, 5,-1, 4, 6, 3, 2, 8, 7,-2, 5 },        // 102
-        { 1, 1, 1,18, 1, 3, 5, 8, 6, 2, 3,-1, 7, 1, 4, 8, 5,-3 },        // 103
-        { 0, 2, 3,18,18, 2,18,-2, 6, 5, 7, 2, 4,18, 3, 6,-3, 5 },        // 104
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 105
-        { 1, 1, 3, 6,17, 8, 7, 5,18,-1, 1, 2, 3, 4, 2, 6, 8, 1 },        // 106
-        { 0, 4,18, 2,17, 3,18,-2, 2, 6,18, 2, 7, 3, 5, 4, 8,18 },        // 107
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 108
-        { 0, 1,18,18, 2, 3, 6, 3, 5,-2, 2, 4,18, 3,-2,-1, 6, 7 },        // 109
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 110
-        { 1, 1,17, 1, 2, 5, 3,-2, 1, 4, 3, 7, 6,-3, 2, 1, 1, 2 },        // 111
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 112
-        { 1, 1,18,18,-2,18,-2, 2, 3, 6,18, 4,-1, 2, 3, 8, 1, 4 },        // 113
-        { 1, 1,17,-2,17, 2,-3, 1, 5,-1, 4, 6, 3, 2, 8, 7,-2, 5 },        // 114
-        { 0, 1,17,17,18, 3, 2,18,18, 6, 8, 2,-2, 3, 5, 4,17,18 },        // 115
-        { 1, 1, 1, 5, 1, 3, 4, 3, 7, 5, 1, 3, 6, 1, 2, 4, 3, 8 },        // 116
-        { 1, 1, 1, 3,-3,18,18, 6, 5,18, 2,-1, 3, 8, 7,-3, 4,17 },        // 117
-        { 1, 1,18, 1, 2, 1, 3, 8, 7, 4, 1, 5, 2,-1,-3,18, 6, 2 },        // 118
-        { 0, 1,18, 3, 5, 2, 6, 8,18, 5, 7, 2, 3,-1, 6, 7, 8, 5 },        // 119
-        { 0, 2,18, 3,-2, 7, 8, 2, 5, 4,-3, 8, 3, 2,18, 5, 4, 6 },        // 120
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 121
-        { 1, 3, 1, 1, 2, 5, 2, 7, 4, 3,-1,18,-2, 8, 2, 1, 6, 7 },        // 122
-        { 0, 1, 3,17,18, 5, 2, 6, 7,18, 4, 5, 3, 6,18, 2, 7, 8 },        // 123
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 124
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 125
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 126
-        { 0, 1, 1,18, 1, 2, 3, 5, 1, 2, 6, 7, 4, 3, 8, 1,17, 5 },        // 127
-        { 1, 2,17,-1,18,-2, 2, 3, 5,18, 2, 4, 6, 7, 3,-1, 5, 8 },        // 128
-        { 1, 1,18,18,-3,18,-2, 2, 3,-2,18, 6, 4, 5, 8, 3,17,-3 },        // 129
-        { 1, 1,18, 7, 6, 5, 5, 3, 1, 4, 2, 7, 3, 4,-3, 6,18, 8 },        // 130
-        { 0, 2,18,18, 2, 3, 5,18, 2, 4, 3, 6,18, 7, 8,-1, 5, 2 },        // 131
-        { 0, 1,18,17,-1, 2,18, 3, 2,18, 4, 3,18, 2, 6, 5, 8,17 },        // 132
-        { 0, 2,18,17, 2, 3,18, 5,-1, 6, 7, 8, 2, 3, 4, 5,18, 6 },        // 133
-        { 1, 2,18,-3,18, 2, 3,-2,-3, 5,18, 7, 6, 2, 4, 3, 8,-2 },        // 134
-        { 1, 1,17,18,18,-2, 2, 3, 5, 4, 8,18,-1, 5, 3, 6,-2, 7 },        // 135
-        { 1, 2,18,17, 2,-2,18, 3,-1, 4,18, 2, 7, 5, 3, 8, 6, 4 },        // 136
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 137
-        { 1, 1, 1, 5, 1, 3, 4, 3, 7, 5, 1, 3, 6, 1, 2, 4, 3, 8 },        // 138
-        { 0, 2,18,18, 3, 3,-2, 2, 5,18, 6, 3,-1, 4, 7,-1, 1, 2 },        // 139
-        { 0, 1,-2, 1,18, 2,-2, 5, 7,18, 3, 2, 6, 2,-1, 4,-2,17 },        // 140
-        { 0, 2,18,18,18, 2, 3,-2,18, 5, 4, 2, 6, 8, 3,-2, 4,18 },        // 141
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 142
-        { 1, 1,17,18,-1, 3, 2, 5, 1, 3, 2, 8, 4, 7, 6, 2,-1, 5 },        // 143
-        { 1, 1,17,18,18, 4, 2, 3, 7, 6,18, 8, 5,-1, 4, 2, 3,17 },        // 144
-        { 0, 1,18,18,-2,18, 2, 3, 4, 5, 6,18, 8, 2, 3, 7,-2, 4 },        // 145
-        { 0, 1,18,-2,18,18,-3,-2, 2, 3, 5, 8, 1, 2, 6, 4, 7,-1 },        // 146
-        { 0, 1,18,17, 2,18, 3,-2, 2, 7, 6, 4,18, 3, 8, 7, 4, 2 },        // 147
-        { 1, 1,17,18,18, 4, 2, 3, 7, 6,18, 8, 5,-1, 4, 2, 3,17 },        // 148
-        { 1, 1,18,17,18, 2, 5, 3,-2,18, 6, 2, 3, 4, 8, 7, 5,-1 },        // 149
-        { 0, 1, 2,-1,18,-1, 2, 4,-3,18, 5, 3, 6,18, 2, 4, 7, 8 },        // 150
-        { 1, 1,17,18, 8, 3, 6, 4,-1, 5, 2, 7, 3, 8, 6, 5,18, 4 },        // 151
-        { 0, 2,18, 3,-2, 7, 8, 2, 5, 4,-3, 8, 3, 2,18, 5, 4, 6 },        // 152
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 153
-        { 1, 1, 1,18,-1, 8, 2, 6, 3,-2, 1, 2, 5, 4,-3, 8, 6, 3 },        // 154
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 155
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 156
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 157
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 158
-        { 0, 1,17,18,18, 4, 2, 7, 3, 6,-2,18, 8, 4, 5, 2, 7,17 },        // 159
-        { 1, 2,18,-1,18, 3,-2,18, 2, 5, 3, 6, 7, 2,-1,18, 8, 4 },        // 160
-        { 1, 2, 1,18,-2, 4,18, 2, 3, 6,-1, 7, 5,-2,18, 8, 2, 4 },        // 161
-        { 1, 2, 1,18,-3, 2, 3,18,-1, 5, 6, 2, 8, 3, 4, 1,-2, 7 },        // 162
-        { 0, 1, 1,17,-1,18, 3, 2, 5, 4, 6, 7, 8, 3, 4, 2, 1,-2 },        // 163
-        { 1, 1,18,17,18, 4, 3, 5, 1, 2, 6, 3, 4, 7, 1, 8, 5, 2 },        // 164
-        { 0, 1,18,-2, 7, 1, 3, 2,-3, 4, 6,-2, 7, 8, 1, 5, 4, 3 },        // 165
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 166
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 167
-        { 0, 2,18,18,18,-2, 2, 5, 3, 7,18, 2, 4,-3, 5, 6, 3, 8 },        // 168
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 169
-        { 0, 3, 3,18,-1, 5, 2, 7,18, 6, 5, 2, 4, 3,-1, 7,18, 6 },        // 170
-        { 0, 2,18,18,18, 4, 3, 2, 6, 4, 8,18, 5, 3, 2, 7,-2, 6 },        // 171
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 172
-        { 0, 2,18,18,18, 2, 3,-2,18, 5, 4, 2, 6, 8, 3,-2, 4,18 },        // 173
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 174
-        { 1, 1,17, 8,18, 3, 2, 1, 5, 4, 6,-1, 3,-3, 8,18, 7, 2 },        // 175
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 176
-        { 0, 1,-1,18,18,18, 2, 4, 6,-2, 2, 8, 3, 4,18, 7,-1, 6 },        // 177
-        { 0, 1,18, 1,-2, 2, 4, 1, 3,-1, 2, 5, 7, 1, 6, 8,-2,17 },        // 178
-        { 0, 1,17,17,18, 2, 5, 4,18, 3, 8, 7, 4, 6, 8, 1, 5, 2 },        // 179
-        { 1, 2,18,18, 5, 4, 6, 3, 4,18, 8, 4,-1, 7, 5, 3, 6, 2 },        // 180
-        { 0, 1,18,18,-3,18, 3, 6, 2, 5, 7,18, 3, 8,-1, 4, 5, 2 },        // 181
-        { 1, 1,18, 2,-2,-3,18, 5, 2,-2, 4, 3, 6,18, 8,-1, 2, 7 },        // 182
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 183
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 184
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 185
-        { 1, 1,17, 1, 7, 2, 3,18,-2, 3, 6, 4, 2, 7, 8, 5, 3,17 },        // 186
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 187
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 188
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 189
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 190
-        { 0, 1,17,18, 3,18, 2, 5, 4, 7,-3, 6, 3, 2,18, 4, 7, 3 },        // 191
-        { 1, 1, 1, 7, 4, 5, 3, 4, 5, 1, 3, 6, 3, 2, 4, 8,-2, 7 },        // 192
-        { 0, 1, 1,18,-1,-2,18, 3, 2,-1, 6, 7, 4, 5, 3,18, 2,-3 },        // 193
-        { 1, 1,18,18,-1, 3, 6,18, 5, 4, 8, 2, 3, 6,18, 7, 4,-2 },        // 194
-        { 0, 2,18,18, 2, 6,18, 2,18, 5, 3,18, 2, 4, 7, 8, 3,18 },        // 195
-        { 1, 1, 3,18,18, 5,18, 6, 2, 4, 7,-2,18, 5, 8, 6, 3, 2 },        // 196
-        { 0, 1,18,-2, 7, 1, 3, 2,-3, 4, 6,-2, 7, 8, 1, 5, 4, 3 },        // 197
-        { 1, 1,18,-2,18, 2, 5,18, 3,-2, 4, 7, 2,-1, 8, 6, 5, 1 },        // 198
-        { 1, 1,17,17, 5,18, 4, 1, 2, 8, 6, 4,-2, 3, 5,-1, 1, 8 },        // 199
-        { 0, 2, 1, 2,17, 3, 7,18, 2,-1, 4, 5,18, 2, 7, 3, 6, 8 },        // 200
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 201
-        { 1, 1, 3, 6,17, 8, 7, 5,18,-1, 1, 2, 3, 4, 2, 6, 8, 1 },        // 202
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 203
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 204
-        { 0, 2,18,18,18, 2,-2, 3, 6, 4, 8,18, 2, 5, 7, 4, 3, 6 },        // 205
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 206
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 207
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 208
-        { 1, 1,18, 1, 8, 3, 5, 6, 4,-1, 8, 3, 7,18, 2, 5, 8, 4 },        // 209
-        { 1, 1,17,18, 5, 2, 4, 3, 1, 6,-2, 1, 3, 2, 4, 5,-1,17 },        // 210
-        { 1, 1,18,17, 2,18, 3,-3, 7, 2, 6, 4, 3, 5,18, 8, 2,-2 },        // 211
-        { 1, 1,18,17,18, 4, 3, 5,-1,18, 2, 7, 8, 4, 6, 3,18, 5 },        // 212
-        { 0, 1,18,17,18,-2, 2,-3, 3, 4, 8, 5, 2,18, 6, 3, 7,-2 },        // 213
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 214
-        { 1, 1,17,18, 8, 3, 4, 6,18, 5,-2, 3, 8, 5, 2, 4, 7, 6 },        // 215
-        { 0, 1,18,-2, 3, 5, 1, 7, 3, 2, 6,-3, 4, 1, 5, 8, 3,-2 },        // 216
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 217
-        { 1, 1, 3,17,18, 5,-1,18, 2, 6, 7,18, 5, 3,-3,-1, 6, 2 },        // 218
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 219
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 220
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 221
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 222
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 223
-        { 1, 3,18,17,-2, 3,-1,18, 2, 5, 3, 7, 6, 2, 4, 8,18, 5 },        // 224
-        { 0, 1,18,-1,18, 2,18, 3, 5,18, 2, 8,18, 5, 4,-1, 6, 2 },        // 225
-        { 1, 2,18,-2,18,18, 2, 3, 4,-3, 2, 5,18, 7, 4, 3, 8, 6 },        // 226
-        { 0, 2,17,-1,18, 2,-1, 1, 7, 3, 8, 5,-2, 4, 1, 2,-3, 6 },        // 227
-        { 0, 1,18,17, 2,18, 2,18, 6, 7, 4, 3,18, 5, 2,-2,17, 8 },        // 228
-        { 0, 3,18,17, 2, 3,-3,-1,18, 2, 4, 5,18, 7, 3, 2,-3, 6 },        // 229
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 230
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 231
-        { 0, 2, 3,18,18,18, 2, 6, 5,18, 7, 2, 4, 6,18, 5, 3, 8 },        // 232
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 233
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 234
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 235
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 236
-        { 0, 1,18,18, 3, 6, 3,-2, 2,18, 5,-1, 7, 3, 4,-2, 2, 6 },        // 237
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 238
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 239
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 240
-        { 1, 1,18,17,18,18,-2, 2, 3,-3,18, 6, 4, 2,-2, 8, 3, 7 },        // 241
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 242
-        { 0, 1,18,18,18, 4, 2, 7, 8,18, 3, 2,-2, 4, 7, 6,17, 5 },        // 243
-        { 1, 1,18,18,-1,-2, 8, 3,18, 6, 3, 5, 8, 2, 4, 7, 1, 6 },        // 244
-        { 1, 1, 1,-3, 3,18,18, 2,-1, 3, 6, 5,18, 4, 7,-2, 8, 3 },        // 245
-        { 1, 1, 1,18, 4, 2, 5,18, 1, 3,-1, 6, 1, 4, 8, 2, 5, 1 },        // 246
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 247
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 248
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 249
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 250
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 251
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 252
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 253
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 254
-        { 0, 1,-1,18,18, 2,18, 3, 5,18, 2,18, 6, 8, 4, 5, 7,-1 },        // 255
-};
-
-#define NUM_FAST_SPECS (sizeof (fast_specs) / sizeof (fast_specs [0]))
-#define NUM_DEFAULT_SPECS (sizeof (default_specs) / sizeof (default_specs [0]))
-#define NUM_HIGH_SPECS (sizeof (high_specs) / sizeof (high_specs [0]))
-#define NUM_VERY_HIGH_SPECS (sizeof (very_high_specs) / sizeof (very_high_specs [0]))
+#include "wavpack_local.h"
+#include "decorr_tables.h"      // contains data, only include from this module!
 
 ///////////////////////////// executable code ////////////////////////////////
 
@@ -1093,6 +35,10 @@ void pack_init (WavpackContext *wpc)
     CLEAR (wps->decorr_passes);
     CLEAR (wps->dc);
 
+#ifdef SKIP_DECORRELATION
+    wpc->config.xmode = 0;
+#endif
+
     /* although we set the term and delta values here for clarity, they're
      * actually hardcoded in the analysis function for speed
      */
@@ -1103,7 +49,7 @@ void pack_init (WavpackContext *wpc)
 
     if (wpc->config.flags & CONFIG_AUTO_SHAPING) {
         if (wpc->config.flags & CONFIG_OPTIMIZE_WVC)
-            wps->dc.shaping_acc [0] = wps->dc.shaping_acc [1] = -512L << 16;
+            wps->dc.shaping_acc [0] = wps->dc.shaping_acc [1] = -(512L << 16);
         else if (wpc->config.sample_rate >= 64000)
             wps->dc.shaping_acc [0] = wps->dc.shaping_acc [1] = 1024L << 16;
         else
@@ -1154,7 +100,7 @@ void pack_init (WavpackContext *wpc)
 // array into the specified metadata structure. Both the actual term id and
 // the delta are packed into single characters.
 
-void write_decorr_terms (WavpackStream *wps, WavpackMetadata *wpmd)
+static void write_decorr_terms (WavpackStream *wps, WavpackMetadata *wpmd)
 {
     int tcount = wps->num_terms;
     struct decorr_pass *dpp;
@@ -1174,7 +120,7 @@ void write_decorr_terms (WavpackStream *wps, WavpackMetadata *wpmd)
 // range +/-1024, but are rounded and truncated to fit in signed chars for
 // metadata storage. Weights are separate for the two channels
 
-void write_decorr_weights (WavpackStream *wps, WavpackMetadata *wpmd)
+static void write_decorr_weights (WavpackStream *wps, WavpackMetadata *wpmd)
 {
     struct decorr_pass *dpp = wps->decorr_passes;
     int tcount = wps->num_terms, i;
@@ -1215,7 +161,7 @@ void write_decorr_weights (WavpackStream *wps, WavpackMetadata *wpmd)
 // sending more than the first term's samples is a waste. The "wcount"
 // variable can be set to the number of terms to have their samples stored.
 
-void write_decorr_samples (WavpackStream *wps, WavpackMetadata *wpmd)
+static void write_decorr_samples (WavpackStream *wps, WavpackMetadata *wpmd)
 {
     int tcount = wps->num_terms, wcount = 1, temp;
     struct decorr_pass *dpp;
@@ -1227,27 +173,27 @@ void write_decorr_samples (WavpackStream *wps, WavpackMetadata *wpmd)
     for (dpp = wps->decorr_passes; tcount--; ++dpp)
         if (wcount) {
             if (dpp->term > MAX_TERM) {
-                dpp->samples_A [0] = exp2s (temp = log2s (dpp->samples_A [0]));
+                dpp->samples_A [0] = wp_exp2s (temp = wp_log2s (dpp->samples_A [0]));
                 *byteptr++ = temp;
                 *byteptr++ = temp >> 8;
-                dpp->samples_A [1] = exp2s (temp = log2s (dpp->samples_A [1]));
+                dpp->samples_A [1] = wp_exp2s (temp = wp_log2s (dpp->samples_A [1]));
                 *byteptr++ = temp;
                 *byteptr++ = temp >> 8;
 
                 if (!(wps->wphdr.flags & MONO_DATA)) {
-                    dpp->samples_B [0] = exp2s (temp = log2s (dpp->samples_B [0]));
+                    dpp->samples_B [0] = wp_exp2s (temp = wp_log2s (dpp->samples_B [0]));
                     *byteptr++ = temp;
                     *byteptr++ = temp >> 8;
-                    dpp->samples_B [1] = exp2s (temp = log2s (dpp->samples_B [1]));
+                    dpp->samples_B [1] = wp_exp2s (temp = wp_log2s (dpp->samples_B [1]));
                     *byteptr++ = temp;
                     *byteptr++ = temp >> 8;
                 }
             }
             else if (dpp->term < 0) {
-                dpp->samples_A [0] = exp2s (temp = log2s (dpp->samples_A [0]));
+                dpp->samples_A [0] = wp_exp2s (temp = wp_log2s (dpp->samples_A [0]));
                 *byteptr++ = temp;
                 *byteptr++ = temp >> 8;
-                dpp->samples_B [0] = exp2s (temp = log2s (dpp->samples_B [0]));
+                dpp->samples_B [0] = wp_exp2s (temp = wp_log2s (dpp->samples_B [0]));
                 *byteptr++ = temp;
                 *byteptr++ = temp >> 8;
             }
@@ -1255,12 +201,12 @@ void write_decorr_samples (WavpackStream *wps, WavpackMetadata *wpmd)
                 int m = 0, cnt = dpp->term;
 
                 while (cnt--) {
-                    dpp->samples_A [m] = exp2s (temp = log2s (dpp->samples_A [m]));
+                    dpp->samples_A [m] = wp_exp2s (temp = wp_log2s (dpp->samples_A [m]));
                     *byteptr++ = temp;
                     *byteptr++ = temp >> 8;
 
                     if (!(wps->wphdr.flags & MONO_DATA)) {
-                        dpp->samples_B [m] = exp2s (temp = log2s (dpp->samples_B [m]));
+                        dpp->samples_B [m] = wp_exp2s (temp = wp_log2s (dpp->samples_B [m]));
                         *byteptr++ = temp;
                         *byteptr++ = temp >> 8;
                     }
@@ -1285,7 +231,7 @@ void write_decorr_samples (WavpackStream *wps, WavpackMetadata *wpmd)
 // hybrid data. The "delta" parameter is not yet used in encoding as it
 // will be part of the "quality" mode.
 
-void write_shaping_info (WavpackStream *wps, WavpackMetadata *wpmd)
+static void write_shaping_info (WavpackStream *wps, WavpackMetadata *wpmd)
 {
     char *byteptr;
     int temp;
@@ -1293,29 +239,29 @@ void write_shaping_info (WavpackStream *wps, WavpackMetadata *wpmd)
     byteptr = wpmd->data = malloc (12);
     wpmd->id = ID_SHAPING_WEIGHTS;
 
-    wps->dc.error [0] = exp2s (temp = log2s (wps->dc.error [0]));
+    wps->dc.error [0] = wp_exp2s (temp = wp_log2s (wps->dc.error [0]));
     *byteptr++ = temp;
     *byteptr++ = temp >> 8;
-    wps->dc.shaping_acc [0] = exp2s (temp = log2s (wps->dc.shaping_acc [0]));
+    wps->dc.shaping_acc [0] = wp_exp2s (temp = wp_log2s (wps->dc.shaping_acc [0]));
     *byteptr++ = temp;
     *byteptr++ = temp >> 8;
 
     if (!(wps->wphdr.flags & MONO_DATA)) {
-        wps->dc.error [1] = exp2s (temp = log2s (wps->dc.error [1]));
+        wps->dc.error [1] = wp_exp2s (temp = wp_log2s (wps->dc.error [1]));
         *byteptr++ = temp;
         *byteptr++ = temp >> 8;
-        wps->dc.shaping_acc [1] = exp2s (temp = log2s (wps->dc.shaping_acc [1]));
+        wps->dc.shaping_acc [1] = wp_exp2s (temp = wp_log2s (wps->dc.shaping_acc [1]));
         *byteptr++ = temp;
         *byteptr++ = temp >> 8;
     }
 
     if (wps->dc.shaping_delta [0] | wps->dc.shaping_delta [1]) {
-        wps->dc.shaping_delta [0] = exp2s (temp = log2s (wps->dc.shaping_delta [0]));
+        wps->dc.shaping_delta [0] = wp_exp2s (temp = wp_log2s (wps->dc.shaping_delta [0]));
         *byteptr++ = temp;
         *byteptr++ = temp >> 8;
 
         if (!(wps->wphdr.flags & MONO_DATA)) {
-            wps->dc.shaping_delta [1] = exp2s (temp = log2s (wps->dc.shaping_delta [1]));
+            wps->dc.shaping_delta [1] = wp_exp2s (temp = wp_log2s (wps->dc.shaping_delta [1]));
             *byteptr++ = temp;
             *byteptr++ = temp >> 8;
         }
@@ -1329,7 +275,7 @@ void write_shaping_info (WavpackStream *wps, WavpackMetadata *wpmd)
 // than 24 bits of magnitude or, in some cases, it's used to eliminate
 // redundant bits from any audio stream.
 
-void write_int32_info (WavpackStream *wps, WavpackMetadata *wpmd)
+static void write_int32_info (WavpackStream *wps, WavpackMetadata *wpmd)
 {
     char *byteptr;
 
@@ -1342,29 +288,43 @@ void write_int32_info (WavpackStream *wps, WavpackMetadata *wpmd)
     wpmd->byte_length = (int32_t)(byteptr - (char *) wpmd->data);
 }
 
+static void write_float_info (WavpackStream *wps, WavpackMetadata *wpmd)
+{
+    char *byteptr;
+
+    byteptr = wpmd->data = malloc (4);          // one byte for each of the four float fields stored below
+    wpmd->id = ID_FLOAT_INFO;
+    *byteptr++ = wps->float_flags;
+    *byteptr++ = wps->float_shift;
+    *byteptr++ = wps->float_max_exp;
+    *byteptr++ = wps->float_norm_exp;
+    wpmd->byte_length = (int32_t)(byteptr - (char *) wpmd->data);   // bytes actually written (always 4 here)
+}
+
 // Allocate room for and copy the multichannel information into the specified
 // metadata structure. The first byte is the total number of channels and the
 // following bytes represent the channel_mask as described for Microsoft
 // WAVEFORMATEX.
 
-void write_channel_info (WavpackContext *wpc, WavpackMetadata *wpmd)
+static void write_channel_info (WavpackContext *wpc, WavpackMetadata *wpmd)
 {
     uint32_t mask = wpc->config.channel_mask;
-    char *byteptr;
+    char *byteptr = wpmd->data = malloc (8);
 
-    if (wpc->num_streams > OLD_MAX_STREAMS) {
-        byteptr = wpmd->data = malloc (6);
-        wpmd->id = ID_CHANNEL_INFO;
-        *byteptr++ = wpc->config.num_channels - 1;
-        *byteptr++ = wpc->num_streams - 1;
+    wpmd->id = ID_CHANNEL_INFO;
+
+    if (wpc->num_streams > OLD_MAX_STREAMS) {       // if > 8 streams, use 6 or 7 bytes (breaks old decoders
+        *byteptr++ = wpc->config.num_channels - 1;  // that could only handle 8 streams) and allow (in theory)
+        *byteptr++ = wpc->num_streams - 1;          // up to 4096 channels
         *byteptr++ = (((wpc->num_streams - 1) >> 4) & 0xf0) | (((wpc->config.num_channels - 1) >> 8) & 0xf);
         *byteptr++ = mask;
         *byteptr++ = (mask >> 8);
         *byteptr++ = (mask >> 16);
+
+        if (mask & 0xff000000)                      // this will break versions < 5.0, but is RF64-specific
+            *byteptr++ = (mask >> 24);
     }
-    else {
-        byteptr = wpmd->data = malloc (4);
-        wpmd->id = ID_CHANNEL_INFO;
+    else {                                          // otherwise use only 1 to 5 bytes
         *byteptr++ = wpc->config.num_channels;
 
         while (mask) {
@@ -1376,17 +336,30 @@ void write_channel_info (WavpackContext *wpc, WavpackMetadata *wpmd)
     wpmd->byte_length = (int32_t)(byteptr - (char *) wpmd->data);
 }
 
+// Allocate room for and copy the multichannel identities into the specified
+// metadata structure. Data is an array of unsigned characters representing
+// any channels in the file that DO NOT match one of the 18 Microsoft standard
+// channels (and are represented in the channel mask). A value of 0 is not
+// allowed and 0xff means an unknown or undefined channel identity.
+
+static void write_channel_identities_info (WavpackContext *wpc, WavpackMetadata *wpmd)
+{
+    wpmd->byte_length = (int) strlen ((char *) wpc->channel_identities);    // length excludes the terminating NUL
+    wpmd->data = strdup ((char *) wpc->channel_identities);                 // heap copy of the NUL-terminated identity string
+    wpmd->id = ID_CHANNEL_IDENTITIES;
+}
+
 // Allocate room for and copy the configuration information into the specified
 // metadata structure. Currently, we just store the upper 3 bytes of
 // config.flags and only in the first block of audio data. Note that this is
 // for informational purposes not required for playback or decoding (like
 // whether high or fast mode was specified).
 
-void write_config_info (WavpackContext *wpc, WavpackMetadata *wpmd)
+static void write_config_info (WavpackContext *wpc, WavpackMetadata *wpmd)
 {
     char *byteptr;
 
-    byteptr = wpmd->data = malloc (4);
+    byteptr = wpmd->data = malloc (8);
     wpmd->id = ID_CONFIG_BLOCK;
     *byteptr++ = (char) (wpc->config.flags >> 8);
     *byteptr++ = (char) (wpc->config.flags >> 16);
@@ -1395,16 +368,64 @@ void write_config_info (WavpackContext *wpc, WavpackMetadata *wpmd)
     if (wpc->config.flags & CONFIG_EXTRA_MODE)
         *byteptr++ = (char) wpc->config.xmode;
 
+    // for the 5.0.0 alpha, we wrote the qmode flags here, but this
+    // has been replaced with the new_config block
+    // *byteptr++ = (char) wpc->config.qmode;
+
     wpmd->byte_length = (int32_t)(byteptr - (char *) wpmd->data);
 }
 
-// Allocate room for and copy the non-standard sampling rateinto the specified
-// metadata structure. We just store the lower 3 bytes of the sampling rate.
-// Note that this would only be used when the sampling rate was not included
-// in the table of 15 "standard" values.
+// Allocate room for and copy the "new" configuration information into the
+// specified metadata structure. This is all the stuff introduced with version
+// 5.0 and includes the qmode flags (big-endian, etc.) and CAF extended
+// channel layouts (including optional reordering). Even if there is no new
+// configuration, we still send the empty metadata block to signal a 5.0 file.
 
-void write_sample_rate (WavpackContext *wpc, WavpackMetadata *wpmd)
+static void write_new_config_info (WavpackContext *wpc, WavpackMetadata *wpmd)
+{
+    char *byteptr = wpmd->data = malloc (260);
 
+    wpmd->id = ID_NEW_CONFIG_BLOCK;
+
+    if (wpc->file_format || (wpc->config.qmode & 0xff) || wpc->channel_layout) {
+        *byteptr++ = (char) wpc->file_format;
+        *byteptr++ = (char) wpc->config.qmode;
+
+        if (wpc->channel_layout) {
+            int nchans = wpc->channel_layout & 0xff;
+
+            *byteptr++ = (char) ((wpc->channel_layout & 0xff0000) >> 16);
+
+            if (wpc->channel_reordering || nchans != wpc->config.num_channels)
+                *byteptr++ = (char) nchans;
+
+            if (wpc->channel_reordering) {
+                int i, num_to_send = 0;
+
+                // to save space, don't send redundant reorder string bytes
+
+                for (i = 0; i < nchans; ++i)
+                    if (wpc->channel_reordering [i] != i)
+                        num_to_send = i + 1;
+
+                if (num_to_send) {
+                    memcpy (byteptr, wpc->channel_reordering, num_to_send);
+                    byteptr += num_to_send;
+                }
+            }
+        }
+    }
+
+    wpmd->byte_length = (int32_t)(byteptr - (char *) wpmd->data);
+}
+
+// Allocate room for and copy the non-standard sampling rate into the specified
+// metadata structure. We normally store the lower 3 bytes of the sampling rate,
+// unless 4 bytes are required (introduced in version 5). Note that this would
+// only be used when the sampling rate was not included in the table of 15
+// "standard" values.
+
+static void write_sample_rate (WavpackContext *wpc, WavpackMetadata *wpmd)
 {
     char *byteptr;
 
@@ -1413,6 +434,12 @@ void write_sample_rate (WavpackContext *wpc, WavpackMetadata *wpmd)
     *byteptr++ = (char) (wpc->config.sample_rate);
     *byteptr++ = (char) (wpc->config.sample_rate >> 8);
     *byteptr++ = (char) (wpc->config.sample_rate >> 16);
+
+    // handle 4-byte sampling rates for scientific applications, etc.
+
+    if (wpc->config.sample_rate & 0x7f000000)
+        *byteptr++ = (char) (wpc->config.sample_rate >> 24) & 0x7f;
+
     wpmd->byte_length = (int32_t)(byteptr - (char *) wpmd->data);
 }
 
@@ -1425,13 +452,13 @@ void write_sample_rate (WavpackContext *wpc, WavpackMetadata *wpmd)
 // "wps->blockend" points to the end of the available space. A return value of
 // FALSE indicates an error.
 
-static void best_floating_line (short *values, int num_values, double *initial_y, double *final_y, short *max_error);
-static void dynamic_noise_shaping (WavpackContext *wpc, int32_t *buffer, int shortening_allowed);
 static int scan_int32_data (WavpackStream *wps, int32_t *values, int32_t num_values);
 static void scan_int32_quick (WavpackStream *wps, int32_t *values, int32_t num_values);
 static void send_int32_data (WavpackStream *wps, int32_t *values, int32_t num_values);
 static int scan_redundancy (int32_t *values, int32_t num_values);
 static int pack_samples (WavpackContext *wpc, int32_t *buffer);
+static void bs_open_write (Bitstream *bs, void *buffer_start, void *buffer_end);
+static uint32_t bs_close_write (Bitstream *bs);
 
 int pack_block (WavpackContext *wpc, int32_t *buffer)
 {
@@ -1440,6 +467,13 @@ int pack_block (WavpackContext *wpc, int32_t *buffer)
     int32_t sample_count = wps->wphdr.block_samples, *orig_data = NULL;
     int dynamic_shaping_done = FALSE;
 
+    // This is done first because this code can potentially change the size of the block about to
+    // be encoded. This can happen because the dynamic noise shaping algorithm wants to send a
+    // shorter block because the desired noise-shaping profile is changing quickly. It can also
+    // be that the --merge-blocks feature wants to create a longer block because it combines areas
+    // with equal redundancy. These are not applicable for anything besides the first stream of
+    // the file and they are not applicable with float data or >24-bit data.
+
     if (!wpc->current_stream && !(flags & FLOAT_DATA) && (flags & MAG_MASK) >> MAG_LSB < 24) {
         if ((wpc->config.flags & CONFIG_DYNAMIC_SHAPING) && !wpc->config.block_samples) {
             dynamic_noise_shaping (wpc, buffer, TRUE);
@@ -1459,7 +493,10 @@ int pack_block (WavpackContext *wpc, int32_t *buffer)
         }
     }
 
-    if (!(flags & MONO_FLAG) && wpc->stream_version >= 0x410) {
+    // This code scans stereo data to check whether it can be stored as mono data
+    // (i.e., all L/R samples identical). Only available with MAX_STREAM_VERS.
+
+    if (!(flags & MONO_FLAG) && wpc->stream_version == MAX_STREAM_VERS) {
         int32_t lor = 0, diff = 0;
         int32_t *sptr, *dptr, i;
 
@@ -1493,6 +530,9 @@ int pack_block (WavpackContext *wpc, int32_t *buffer)
         }
     }
 
+    // This is where we handle any fixed shift which occurs when the integer size does not evenly fit
+    // in bytes (like 12-bit or 20-bit) and is the same for the entire file (not based on scanning)
+
     if (flags & SHIFT_MASK) {
         int shift = (flags & SHIFT_MASK) >> SHIFT_LSB;
         int mag = (flags & MAG_MASK) >> MAG_LSB;
@@ -1516,12 +556,23 @@ int pack_block (WavpackContext *wpc, int32_t *buffer)
         wps->wphdr.flags = flags;
     }
 
-    if ((flags & FLOAT_DATA) || (flags & MAG_MASK) >> MAG_LSB >= 24) {
+    // The regular WavPack decorrelation and entropy encoding can handle up to 24-bit integer data. If
+    // we have float data or integers larger than 24-bit, then we have to potentially do extra processing.
+    // For lossy encoding, we can simply convert this data in-place to 24-bit data and encode and send
+    // that, along with some metadata about how to restore the original format (even if the restoration
+    // is not exact). However, for lossless operation we must make a copy of the original data that will
+    // be used to create an "extension stream" that will allow verbatim restoration of the original data.
+    // In the hybrid mode that extension goes in the correction file, otherwise it goes in the main file.
+
+    if ((flags & FLOAT_DATA) || (flags & MAG_MASK) >> MAG_LSB >= 24) {      // if float data or >24-bit integers...
+
+        // if lossless we have to copy the data to use later...
+
         if ((!(flags & HYBRID_FLAG) || wpc->wvc_flag) && !(wpc->config.flags & CONFIG_SKIP_WVX)) {
             orig_data = malloc (sizeof (f32) * ((flags & MONO_DATA) ? sample_count : sample_count * 2));
             memcpy (orig_data, buffer, sizeof (f32) * ((flags & MONO_DATA) ? sample_count : sample_count * 2));
 
-            if (flags & FLOAT_DATA) {
+            if (flags & FLOAT_DATA) {                                       // if lossless float data come here
                 wps->float_norm_exp = wpc->config.float_norm_exp;
 
                 if (!scan_float_data (wps, (f32 *) buffer, (flags & MONO_DATA) ? sample_count : sample_count * 2)) {
@@ -1529,14 +580,14 @@ int pack_block (WavpackContext *wpc, int32_t *buffer)
                     orig_data = NULL;
                 }
             }
-            else {
+            else {                                                          // otherwise lossless > 24-bit integers
                 if (!scan_int32_data (wps, buffer, (flags & MONO_DATA) ? sample_count : sample_count * 2)) {
                     free (orig_data);
                     orig_data = NULL;
                 }
             }
         }
-        else {
+        else {                                                              // otherwise, we're lossy, so no copy
             if (flags & FLOAT_DATA) {
                 wps->float_norm_exp = wpc->config.float_norm_exp;
 
@@ -1547,20 +598,30 @@ int pack_block (WavpackContext *wpc, int32_t *buffer)
                 wpc->lossy_blocks = TRUE;
         }
 
+        // if there's any chance of magnitude change, clear the noise-shaping error term
+        // and also reset the entropy encoder (which this does)
+
+        wps->dc.error [0] = wps->dc.error [1] = 0;
         wps->num_terms = 0;
     }
+    // if 24-bit integers or less we do a "quick" scan which just scans for redundancy and does NOT set the flag's "magnitude" value
     else {
         scan_int32_quick (wps, buffer, (flags & MONO_DATA) ? sample_count : sample_count * 2);
 
-        if (wps->shift != wps->int32_zeros + wps->int32_ones + wps->int32_dups) {
+        if (wps->shift != wps->int32_zeros + wps->int32_ones + wps->int32_dups) {   // detect a change in any redundancy shifting here
             wps->shift = wps->int32_zeros + wps->int32_ones + wps->int32_dups;
-            wps->num_terms = 0;
+            wps->dc.error [0] = wps->dc.error [1] = 0;                              // on a change, clear the noise-shaping error term and
+            wps->num_terms = 0;                                                     // also reset the entropy encoder (which this does)
         }
     }
 
-    if ((wpc->config.flags & CONFIG_DYNAMIC_SHAPING) && !dynamic_shaping_done)
+    if ((wpc->config.flags & CONFIG_DYNAMIC_SHAPING) && !dynamic_shaping_done)      // calculate dynamic noise profile
         dynamic_noise_shaping (wpc, buffer, FALSE);
 
+    // In some cases we need to start the decorrelation and entropy encoding from scratch. This
+    // could be because we switched from stereo to mono encoding or because the magnitude of
+    // the data changed, or just because this is the first block.
+
     if (!wps->num_passes && !wps->num_terms) {
         wps->num_passes = 1;
 
@@ -1572,6 +633,8 @@ int pack_block (WavpackContext *wpc, int32_t *buffer)
         wps->num_passes = 0;
     }
 
+    // actually pack the block here and return on an error (which pretty much can only be a block buffer overrun)
+
     if (!pack_samples (wpc, buffer)) {
         wps->wphdr.flags = sflags;
 
@@ -1583,6 +646,8 @@ int pack_block (WavpackContext *wpc, int32_t *buffer)
     else
         wps->wphdr.flags = sflags;
 
+    // potentially move any unused dynamic noise shaping profile data to use next time
+
     if (wps->dc.shaping_data) {
         if (wps->dc.shaping_samples != sample_count)
             memmove (wps->dc.shaping_data, wps->dc.shaping_data + sample_count,
@@ -1591,6 +656,10 @@ int pack_block (WavpackContext *wpc, int32_t *buffer)
         wps->dc.shaping_samples -= sample_count;
     }
 
+    // finally, if we're doing lossless float data or lossless >24-bit integers, this is where we take the
+    // original data that we saved earlier and create the "extension" stream containing the information
+    // required to refine the "lossy" 24-bit data into the lossless original
+
     if (orig_data) {
         uint32_t data_count;
         unsigned char *cptr;
@@ -1634,133 +703,6 @@ int pack_block (WavpackContext *wpc, int32_t *buffer)
     return TRUE;
 }
 
-static void dynamic_noise_shaping (WavpackContext *wpc, int32_t *buffer, int shortening_allowed)
-{
-    WavpackStream *wps = wpc->streams [wpc->current_stream];
-    int32_t sample_count = wps->wphdr.block_samples;
-    struct decorr_pass *ap = &wps->analysis_pass;
-    uint32_t flags = wps->wphdr.flags;
-    int32_t *bptr, temp, sam;
-    short *swptr;
-    int sc;
-
-    if (!wps->num_terms && sample_count > 8) {
-        if (flags & MONO_DATA)
-            for (bptr = buffer + sample_count - 3, sc = sample_count - 2; sc--;) {
-                sam = (3 * bptr [1] - bptr [2]) >> 1;
-                temp = *bptr-- - apply_weight (ap->weight_A, sam);
-                update_weight (ap->weight_A, 2, sam, temp);
-            }
-        else
-            for (bptr = buffer + (sample_count - 3) * 2 + 1, sc = sample_count - 2; sc--;) {
-                sam = (3 * bptr [2] - bptr [4]) >> 1;
-                temp = *bptr-- - apply_weight (ap->weight_B, sam);
-                update_weight (ap->weight_B, 2, sam, temp);
-                sam = (3 * bptr [2] - bptr [4]) >> 1;
-                temp = *bptr-- - apply_weight (ap->weight_A, sam);
-                update_weight (ap->weight_A, 2, sam, temp);
-            }
-    }
-
-    if (sample_count > wps->dc.shaping_samples) {
-        sc = sample_count - wps->dc.shaping_samples;
-        swptr = wps->dc.shaping_data + wps->dc.shaping_samples;
-        bptr = buffer + wps->dc.shaping_samples * ((flags & MONO_DATA) ? 1 : 2);
-
-        if (flags & MONO_DATA)
-            while (sc--) {
-                sam = (3 * ap->samples_A [0] - ap->samples_A [1]) >> 1;
-                temp = *bptr - apply_weight (ap->weight_A, sam);
-                update_weight (ap->weight_A, 2, sam, temp);
-                ap->samples_A [1] = ap->samples_A [0];
-                ap->samples_A [0] = *bptr++;
-                *swptr++ = (ap->weight_A < 256) ? 1024 : 1536 - ap->weight_A * 2;
-            }
-        else
-            while (sc--) {
-                sam = (3 * ap->samples_A [0] - ap->samples_A [1]) >> 1;
-                temp = *bptr - apply_weight (ap->weight_A, sam);
-                update_weight (ap->weight_A, 2, sam, temp);
-                ap->samples_A [1] = ap->samples_A [0];
-                ap->samples_A [0] = *bptr++;
-
-                sam = (3 * ap->samples_B [0] - ap->samples_B [1]) >> 1;
-                temp = *bptr - apply_weight (ap->weight_B, sam);
-                update_weight (ap->weight_B, 2, sam, temp);
-                ap->samples_B [1] = ap->samples_B [0];
-                ap->samples_B [0] = *bptr++;
-
-                *swptr++ = (ap->weight_A + ap->weight_B < 512) ? 1024 : 1536 - ap->weight_A - ap->weight_B;
-            }
-
-        wps->dc.shaping_samples = sample_count;
-    }
-
-    if (wpc->wvc_flag) {
-        int max_allowed_error = 1000000 / wpc->ave_block_samples;
-        short max_error, trial_max_error;
-        double initial_y, final_y;
-
-        if (max_allowed_error < 128)
-            max_allowed_error = 128;
-
-        best_floating_line (wps->dc.shaping_data, sample_count, &initial_y, &final_y, &max_error);
-
-        if (shortening_allowed && max_error > max_allowed_error) {
-            int min_samples = 0, max_samples = sample_count, trial_count;
-            double trial_initial_y, trial_final_y;
-
-            while (1) {
-                trial_count = (min_samples + max_samples) / 2;
-
-                best_floating_line (wps->dc.shaping_data, trial_count, &trial_initial_y,
-                    &trial_final_y, &trial_max_error);
-
-                if (trial_max_error < max_allowed_error) {
-                    max_error = trial_max_error;
-                    min_samples = trial_count;
-                    initial_y = trial_initial_y;
-                    final_y = trial_final_y;
-                }
-                else
-                    max_samples = trial_count;
-
-                if (min_samples > 10000 || max_samples - min_samples < 2)
-                    break;
-            }
-
-            sample_count = min_samples;
-        }
-
-        if (initial_y < -512) initial_y = -512;
-        else if (initial_y > 1024) initial_y = 1024;
-
-        if (final_y < -512) final_y = -512;
-        else if (final_y > 1024) final_y = 1024;
-#if 0
-        error_line ("%.2f sec, sample count = %5d, max error = %3d, range = %5d, %5d, actual = %5d, %5d",
-            (double) wps->sample_index / wpc->config.sample_rate, sample_count, max_error,
-            (int) floor (initial_y), (int) floor (final_y),
-            wps->dc.shaping_data [0], wps->dc.shaping_data [sample_count-1]);
-#endif
-        if (sample_count != wps->wphdr.block_samples)
-            wps->wphdr.block_samples = sample_count;
-
-        if (wpc->wvc_flag) {
-            wps->dc.shaping_acc [0] = wps->dc.shaping_acc [1] = (int32_t) floor (initial_y * 65536.0 + 0.5);
-
-            wps->dc.shaping_delta [0] = wps->dc.shaping_delta [1] =
-                (int32_t) floor ((final_y - initial_y) / (sample_count - 1) * 65536.0 + 0.5);
-
-            wps->dc.shaping_array = NULL;
-        }
-        else
-            wps->dc.shaping_array = wps->dc.shaping_data;
-    }
-    else
-        wps->dc.shaping_array = wps->dc.shaping_data;
-}
-
 // Quickly scan a buffer of long integer data and determine whether any
 // redundancy in the LSBs can be used to reduce the data's magnitude. If yes,
 // then the INT32_DATA flag is set and the int32 parameters are set. This
@@ -1957,6 +899,45 @@ static void send_int32_data (WavpackStream *wps, int32_t *values, int32_t num_va
         }
 }
 
+void send_general_metadata (WavpackContext *wpc)
+{
+    WavpackStream *wps = wpc->streams [wpc->current_stream];
+    uint32_t flags = wps->wphdr.flags;
+    WavpackMetadata wpmd;
+    // each section: build one item in wpmd, pass it to copy_metadata() with the block buffer bounds, then free it
+    if ((flags & SRATE_MASK) == SRATE_MASK && wpc->config.sample_rate != 44100) {   // rate field is all ones and rate is not 44100
+        write_sample_rate (wpc, &wpmd);
+        copy_metadata (&wpmd, wps->blockbuff, wps->blockend);
+        free_metadata (&wpmd);
+    }
+
+    if ((flags & INITIAL_BLOCK) &&
+        (wpc->config.num_channels > 2 ||
+        wpc->config.channel_mask != 0x5 - wpc->config.num_channels)) {  // i.e. mask is not 0x4 for 1 channel or 0x3 for 2 channels
+            write_channel_info (wpc, &wpmd);
+            copy_metadata (&wpmd, wps->blockbuff, wps->blockend);
+            free_metadata (&wpmd);
+
+            if (wpc->channel_identities) {                              // identities are only sent along with channel info
+                write_channel_identities_info (wpc, &wpmd);
+                copy_metadata (&wpmd, wps->blockbuff, wps->blockend);
+                free_metadata (&wpmd);
+            }
+    }
+
+    if ((flags & INITIAL_BLOCK) && !wps->sample_index) {                // config block only when sample_index is zero
+        write_config_info (wpc, &wpmd);
+        copy_metadata (&wpmd, wps->blockbuff, wps->blockend);
+        free_metadata (&wpmd);
+    }
+
+    if (flags & INITIAL_BLOCK) {                                        // new-config block on every INITIAL_BLOCK
+        write_new_config_info (wpc, &wpmd);
+        copy_metadata (&wpmd, wps->blockbuff, wps->blockend);           // NOTE(review): copy_metadata's result is not checked anywhere in this function
+        free_metadata (&wpmd);
+    }
+}
+
 // Pack an entire block of samples (either mono or stereo) into a completed
 // WavPack block. It is assumed that there is sufficient space for the
 // completed block at "wps->blockbuff" and that "wps->blockend" points to the
@@ -1968,20 +949,55 @@ static void send_int32_data (WavpackStream *wps, int32_t *values, int32_t num_va
 // the caller must look at the ckSize field of the written WavpackHeader, NOT
 // the one in the WavpackStream.
 
-static void decorr_stereo_pass (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count);
-static void decorr_stereo_pass_id2 (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count);
+#ifdef OPT_ASM_X86
+    #define DECORR_STEREO_PASS(a,b,c) do {              \
+        if (pack_cpu_has_feature_x86 (CPU_FEATURE_MMX)) \
+            pack_decorr_stereo_pass_x86 (a, b, c);      \
+        else decorr_stereo_pass (a, b, c); } while (0)
+    #define DECORR_MONO_BUFFER pack_decorr_mono_buffer_x86
+    #define SCAN_MAX_MAGNITUDE(a,b)                     \
+        (pack_cpu_has_feature_x86 (CPU_FEATURE_MMX) ?   \
+            scan_max_magnitude_x86 (a, b) :             \
+            scan_max_magnitude (a, b))
+#elif defined(OPT_ASM_X64) && (defined (_WIN64) || defined(__CYGWIN__) || defined(__MINGW64__))
+    #define DECORR_STEREO_PASS pack_decorr_stereo_pass_x64win
+    #define DECORR_MONO_BUFFER pack_decorr_mono_buffer_x64win
+    #define SCAN_MAX_MAGNITUDE scan_max_magnitude_x64win
+#elif defined(OPT_ASM_X64)
+    #define DECORR_STEREO_PASS pack_decorr_stereo_pass_x64
+    #define DECORR_MONO_BUFFER pack_decorr_mono_buffer_x64
+    #define SCAN_MAX_MAGNITUDE scan_max_magnitude_x64
+#else
+    #define DECORR_STEREO_PASS decorr_stereo_pass
+    #define DECORR_MONO_BUFFER decorr_mono_buffer
+    #define SCAN_MAX_MAGNITUDE scan_max_magnitude
+#endif
+
+uint32_t DECORR_MONO_BUFFER (int32_t *buffer, struct decorr_pass *decorr_passes, int32_t num_terms, int32_t sample_count);
+
+#ifdef OPT_ASM_X86
+void decorr_stereo_pass (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count);
+void pack_decorr_stereo_pass_x86 (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count);
+uint32_t scan_max_magnitude (int32_t *values, int32_t num_values);
+uint32_t scan_max_magnitude_x86 (int32_t *values, int32_t num_values);
+#else
+void DECORR_STEREO_PASS (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count);
+uint32_t SCAN_MAX_MAGNITUDE (int32_t *values, int32_t num_values);
+#endif
+
+// This macro controls the "repack" function where a block of samples will be repacked with
+// fewer terms if a single residual exceeds the specified magnitude threshold.
+
+#define REPACK_SAFE_NUM_TERMS 5                 // 5 terms is always okay (and we truncate to this)
 
 static int pack_samples (WavpackContext *wpc, int32_t *buffer)
 {
-    WavpackStream *wps = wpc->streams [wpc->current_stream];
-    uint32_t flags = wps->wphdr.flags, data_count, crc, crc2, i;
-    uint32_t sample_count = wps->wphdr.block_samples;
-    short *shaping_array = wps->dc.shaping_array;
-    int tcount, lossy = FALSE, m = 0;
-    double noise_acc = 0.0, noise;
+    WavpackStream *wps = wpc->streams [wpc->current_stream], saved_stream;
+    uint32_t flags = wps->wphdr.flags, repack_possible, data_count, crc, crc2, i;
+    uint32_t sample_count = wps->wphdr.block_samples, repack_mask;
+    int32_t *bptr, *saved_buffer = NULL;
     struct decorr_pass *dpp;
     WavpackMetadata wpmd;
-    int32_t *bptr;
 
     crc = crc2 = 0xffffffff;
 
@@ -2036,794 +1052,436 @@ static int pack_samples (WavpackContext *wpc, int32_t *buffer)
     if (!sample_count)
         return TRUE;
 
-    write_decorr_terms (wps, &wpmd);
-    copy_metadata (&wpmd, wps->blockbuff, wps->blockend);
-    free_metadata (&wpmd);
+    memcpy (&wps->wphdr, wps->blockbuff, sizeof (WavpackHeader));
+    repack_possible = !wps->num_passes && wps->num_terms > REPACK_SAFE_NUM_TERMS;
+    repack_mask = (flags & MAG_MASK) >> MAG_LSB >= 16 ? 0xF0000000 : 0xFFF00000;
+    saved_stream = *wps;
 
-    write_decorr_weights (wps, &wpmd);
-    copy_metadata (&wpmd, wps->blockbuff, wps->blockend);
-    free_metadata (&wpmd);
-
-    write_decorr_samples (wps, &wpmd);
-    copy_metadata (&wpmd, wps->blockbuff, wps->blockend);
-    free_metadata (&wpmd);
-
-    write_entropy_vars (wps, &wpmd);
-    copy_metadata (&wpmd, wps->blockbuff, wps->blockend);
-    free_metadata (&wpmd);
-
-    if ((flags & SRATE_MASK) == SRATE_MASK && wpc->config.sample_rate != 44100) {
-        write_sample_rate (wpc, &wpmd);
-        copy_metadata (&wpmd, wps->blockbuff, wps->blockend);
-        free_metadata (&wpmd);
+    if (repack_possible && !(flags & HYBRID_FLAG)) {
+        saved_buffer = malloc (sample_count * sizeof (int32_t) * (flags & MONO_DATA ? 1 : 2));
+        memcpy (saved_buffer, buffer, sample_count * sizeof (int32_t) * (flags & MONO_DATA ? 1 : 2));
     }
 
-    if (flags & HYBRID_FLAG) {
-        write_hybrid_profile (wps, &wpmd);
+    // This code is written as a loop, but in the overwhelming majority of cases it executes only once.
+    // If one of the higher modes is being used and a residual exceeds a certain threshold, then the
+    // block will be repacked using fewer decorrelation terms. This has only been triggered by
+    // pathological audio samples designed to trigger it; in practice it might never happen. Also note
+    // that this only applies to the "high" and "very high" modes and only when packing directly
+    // (i.e. without the "extra" modes that will have already checked magnitude).
+
+    do {
+        short *shaping_array = wps->dc.shaping_array;
+        int tcount, lossy = FALSE, m = 0;
+        double noise_acc = 0.0, noise;
+        uint32_t max_magnitude = 0;
+
+        write_decorr_terms (wps, &wpmd);
         copy_metadata (&wpmd, wps->blockbuff, wps->blockend);
         free_metadata (&wpmd);
-    }
 
-    if (flags & FLOAT_DATA) {
-        write_float_info (wps, &wpmd);
+        write_decorr_weights (wps, &wpmd);
         copy_metadata (&wpmd, wps->blockbuff, wps->blockend);
         free_metadata (&wpmd);
-    }
 
-    if (flags & INT32_DATA) {
-        write_int32_info (wps, &wpmd);
+        write_decorr_samples (wps, &wpmd);
         copy_metadata (&wpmd, wps->blockbuff, wps->blockend);
         free_metadata (&wpmd);
-    }
 
-    if ((flags & INITIAL_BLOCK) &&
-        (wpc->config.num_channels > 2 ||
-        wpc->config.channel_mask != 0x5 - wpc->config.num_channels)) {
-            write_channel_info (wpc, &wpmd);
+        write_entropy_vars (wps, &wpmd);
+        copy_metadata (&wpmd, wps->blockbuff, wps->blockend);
+        free_metadata (&wpmd);
+
+        if (flags & HYBRID_FLAG) {
+            write_hybrid_profile (wps, &wpmd);
             copy_metadata (&wpmd, wps->blockbuff, wps->blockend);
             free_metadata (&wpmd);
-    }
+        }
 
-    if ((flags & INITIAL_BLOCK) && !wps->sample_index) {
-        write_config_info (wpc, &wpmd);
-        copy_metadata (&wpmd, wps->blockbuff, wps->blockend);
-        free_metadata (&wpmd);
-    }
-
-    bs_open_write (&wps->wvbits, wps->blockbuff + ((WavpackHeader *) wps->blockbuff)->ckSize + 12, wps->blockend);
-
-    if (wpc->wvc_flag) {
-        wps->wphdr.ckSize = sizeof (WavpackHeader) - 8;
-        memcpy (wps->block2buff, &wps->wphdr, sizeof (WavpackHeader));
-
-        if (flags & HYBRID_SHAPE) {
-            write_shaping_info (wps, &wpmd);
-            copy_metadata (&wpmd, wps->block2buff, wps->block2end);
+        if (flags & FLOAT_DATA) {
+            write_float_info (wps, &wpmd);
+            copy_metadata (&wpmd, wps->blockbuff, wps->blockend);
             free_metadata (&wpmd);
         }
 
-        bs_open_write (&wps->wvcbits, wps->block2buff + ((WavpackHeader *) wps->block2buff)->ckSize + 12, wps->block2end);
-    }
+        if (flags & INT32_DATA) {
+            write_int32_info (wps, &wpmd);
+            copy_metadata (&wpmd, wps->blockbuff, wps->blockend);
+            free_metadata (&wpmd);
+        }
 
-    /////////////////////// handle lossless mono mode /////////////////////////
+        send_general_metadata (wpc);
+        bs_open_write (&wps->wvbits, wps->blockbuff + ((WavpackHeader *) wps->blockbuff)->ckSize + 12, wps->blockend);
 
-    if (!(flags & HYBRID_FLAG) && (flags & MONO_DATA)) {
-        if (!wps->num_passes)
+        if (wpc->wvc_flag) {
+            wps->wphdr.ckSize = sizeof (WavpackHeader) - 8;
+            memcpy (wps->block2buff, &wps->wphdr, sizeof (WavpackHeader));
+
+            if (flags & HYBRID_SHAPE) {
+                write_shaping_info (wps, &wpmd);
+                copy_metadata (&wpmd, wps->block2buff, wps->block2end);
+                free_metadata (&wpmd);
+            }
+
+            bs_open_write (&wps->wvcbits, wps->block2buff + ((WavpackHeader *) wps->block2buff)->ckSize + 12, wps->block2end);
+        }
+
+        /////////////////////// handle lossless mono mode /////////////////////////
+
+        if (!(flags & HYBRID_FLAG) && (flags & MONO_DATA)) {
+            if (!wps->num_passes) {
+                max_magnitude = DECORR_MONO_BUFFER (buffer, wps->decorr_passes, wps->num_terms, sample_count);
+                m = sample_count & (MAX_TERM - 1);
+            }
+
+            send_words_lossless (wps, buffer, sample_count);
+        }
+
+        //////////////////// handle the lossless stereo mode //////////////////////
+
+        else if (!(flags & HYBRID_FLAG) && !(flags & MONO_DATA)) {
+            if (!wps->num_passes) {
+                if (flags & JOINT_STEREO) {
+                    int32_t *eptr = buffer + (sample_count * 2);
+
+                    for (bptr = buffer; bptr < eptr; bptr += 2)
+                        bptr [1] += ((bptr [0] -= bptr [1]) >> 1);
+                }
+
+                for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount-- ; dpp++)
+                    DECORR_STEREO_PASS (dpp, buffer, sample_count);
+
+                m = sample_count & (MAX_TERM - 1);
+
+                if (repack_possible)
+                    max_magnitude = SCAN_MAX_MAGNITUDE (buffer, sample_count * 2);
+            }
+
+            send_words_lossless (wps, buffer, sample_count);
+        }
+
+        /////////////////// handle the lossy/hybrid mono mode /////////////////////
+
+        else if ((flags & HYBRID_FLAG) && (flags & MONO_DATA))
             for (bptr = buffer, i = 0; i < sample_count; ++i) {
-                int32_t code = *bptr;
+                int32_t code, temp;
+                int shaping_weight;
 
-                for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount--; dpp++) {
-                    int32_t sam;
+                crc2 += (crc2 << 1) + (code = *bptr++);
 
+                if (flags & HYBRID_SHAPE) {
+                    if (shaping_array)
+                        shaping_weight = *shaping_array++;
+                    else
+                        shaping_weight = (wps->dc.shaping_acc [0] += wps->dc.shaping_delta [0]) >> 16;
+
+                    temp = -apply_weight (shaping_weight, wps->dc.error [0]);
+
+                    if ((flags & NEW_SHAPING) && shaping_weight < 0 && temp) {
+                        if (temp == wps->dc.error [0])
+                            temp = (temp < 0) ? temp + 1 : temp - 1;
+
+                        wps->dc.error [0] = -code;
+                        code += temp;
+                    }
+                    else
+                        wps->dc.error [0] = -(code += temp);
+                }
+
+                for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount-- ; dpp++)
                     if (dpp->term > MAX_TERM) {
                         if (dpp->term & 1)
-                            sam = 2 * dpp->samples_A [0] - dpp->samples_A [1];
+                            dpp->samples_A [2] = 2 * dpp->samples_A [0] - dpp->samples_A [1];
                         else
-                            sam = (3 * dpp->samples_A [0] - dpp->samples_A [1]) >> 1;
+                            dpp->samples_A [2] = (3 * dpp->samples_A [0] - dpp->samples_A [1]) >> 1;
+
+                        code -= (dpp->aweight_A = apply_weight (dpp->weight_A, dpp->samples_A [2]));
+                    }
+                    else
+                        code -= (dpp->aweight_A = apply_weight (dpp->weight_A, dpp->samples_A [m]));
+
+                max_magnitude |= (code < 0 ? ~code : code);
+                code = send_word (wps, code, 0);
+
+                while (--dpp >= wps->decorr_passes) {
+                    if (dpp->term > MAX_TERM) {
+                        update_weight (dpp->weight_A, dpp->delta, dpp->samples_A [2], code);
+                        dpp->samples_A [1] = dpp->samples_A [0];
+                        dpp->samples_A [0] = (code += dpp->aweight_A);
+                    }
+                    else {
+                        int32_t sam = dpp->samples_A [m];
+
+                        update_weight (dpp->weight_A, dpp->delta, sam, code);
+                        dpp->samples_A [(m + dpp->term) & (MAX_TERM - 1)] = (code += dpp->aweight_A);
+                    }
+                }
+
+                wps->dc.error [0] += code;
+                m = (m + 1) & (MAX_TERM - 1);
+
+                if ((crc += (crc << 1) + code) != crc2)
+                    lossy = TRUE;
+
+                if (wpc->config.flags & CONFIG_CALC_NOISE) {
+                    noise = code - bptr [-1];
+
+                    noise_acc += noise *= noise;
+                    wps->dc.noise_ave = (wps->dc.noise_ave * 0.99) + (noise * 0.01);
+
+                    if (wps->dc.noise_ave > wps->dc.noise_max)
+                        wps->dc.noise_max = wps->dc.noise_ave;
+                }
+            }
+
+        /////////////////// handle the lossy/hybrid stereo mode ///////////////////
+
+        else if ((flags & HYBRID_FLAG) && !(flags & MONO_DATA))
+            for (bptr = buffer, i = 0; i < sample_count; ++i) {
+                int32_t left, right, temp;
+                int shaping_weight;
+
+                left = *bptr++;
+                crc2 += (crc2 << 3) + (left << 1) + left + (right = *bptr++);
+
+                if (flags & HYBRID_SHAPE) {
+                    if (shaping_array)
+                        shaping_weight = *shaping_array++;
+                    else
+                        shaping_weight = (wps->dc.shaping_acc [0] += wps->dc.shaping_delta [0]) >> 16;
+
+                    temp = -apply_weight (shaping_weight, wps->dc.error [0]);
+
+                    if ((flags & NEW_SHAPING) && shaping_weight < 0 && temp) {
+                        if (temp == wps->dc.error [0])
+                            temp = (temp < 0) ? temp + 1 : temp - 1;
+
+                        wps->dc.error [0] = -left;
+                        left += temp;
+                    }
+                    else
+                        wps->dc.error [0] = -(left += temp);
+
+                    if (!shaping_array)
+                        shaping_weight = (wps->dc.shaping_acc [1] += wps->dc.shaping_delta [1]) >> 16;
+
+                    temp = -apply_weight (shaping_weight, wps->dc.error [1]);
+
+                    if ((flags & NEW_SHAPING) && shaping_weight < 0 && temp) {
+                        if (temp == wps->dc.error [1])
+                            temp = (temp < 0) ? temp + 1 : temp - 1;
+
+                        wps->dc.error [1] = -right;
+                        right += temp;
+                    }
+                    else
+                        wps->dc.error [1] = -(right += temp);
+                }
+
+                if (flags & JOINT_STEREO)
+                    right += ((left -= right) >> 1);
+
+                for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount-- ; dpp++)
+                    if (dpp->term > MAX_TERM) {
+                        if (dpp->term & 1) {
+                            dpp->samples_A [2] = 2 * dpp->samples_A [0] - dpp->samples_A [1];
+                            dpp->samples_B [2] = 2 * dpp->samples_B [0] - dpp->samples_B [1];
+                        }
+                        else {
+                            dpp->samples_A [2] = (3 * dpp->samples_A [0] - dpp->samples_A [1]) >> 1;
+                            dpp->samples_B [2] = (3 * dpp->samples_B [0] - dpp->samples_B [1]) >> 1;
+                        }
+
+                        left -= (dpp->aweight_A = apply_weight (dpp->weight_A, dpp->samples_A [2]));
+                        right -= (dpp->aweight_B = apply_weight (dpp->weight_B, dpp->samples_B [2]));
+                    }
+                    else if (dpp->term > 0) {
+                        left -= (dpp->aweight_A = apply_weight (dpp->weight_A, dpp->samples_A [m]));
+                        right -= (dpp->aweight_B = apply_weight (dpp->weight_B, dpp->samples_B [m]));
+                    }
+                    else {
+                        if (dpp->term == -1)
+                            dpp->samples_B [0] = left;
+                        else if (dpp->term == -2)
+                            dpp->samples_A [0] = right;
+
+                        left -= (dpp->aweight_A = apply_weight (dpp->weight_A, dpp->samples_A [0]));
+                        right -= (dpp->aweight_B = apply_weight (dpp->weight_B, dpp->samples_B [0]));
+                    }
+
+                max_magnitude |= (left < 0 ? ~left : left) | (right < 0 ? ~right : right);
+                left = send_word (wps, left, 0);
+                right = send_word (wps, right, 1);
+
+                while (--dpp >= wps->decorr_passes)
+                    if (dpp->term > MAX_TERM) {
+                        update_weight (dpp->weight_A, dpp->delta, dpp->samples_A [2], left);
+                        update_weight (dpp->weight_B, dpp->delta, dpp->samples_B [2], right);
 
                         dpp->samples_A [1] = dpp->samples_A [0];
-                        dpp->samples_A [0] = code;
+                        dpp->samples_B [1] = dpp->samples_B [0];
+
+                        dpp->samples_A [0] = (left += dpp->aweight_A);
+                        dpp->samples_B [0] = (right += dpp->aweight_B);
+                    }
+                    else if (dpp->term > 0) {
+                        int k = (m + dpp->term) & (MAX_TERM - 1);
+
+                        update_weight (dpp->weight_A, dpp->delta, dpp->samples_A [m], left);
+                        dpp->samples_A [k] = (left += dpp->aweight_A);
+
+                        update_weight (dpp->weight_B, dpp->delta, dpp->samples_B [m], right);
+                        dpp->samples_B [k] = (right += dpp->aweight_B);
                     }
                     else {
-                        sam = dpp->samples_A [m];
-                        dpp->samples_A [(m + dpp->term) & (MAX_TERM - 1)] = code;
+                        if (dpp->term == -1) {
+                            dpp->samples_B [0] = left + dpp->aweight_A;
+                            dpp->aweight_B = apply_weight (dpp->weight_B, dpp->samples_B [0]);
+                        }
+                        else if (dpp->term == -2) {
+                            dpp->samples_A [0] = right + dpp->aweight_B;
+                            dpp->aweight_A = apply_weight (dpp->weight_A, dpp->samples_A [0]);
+                        }
+
+                        update_weight_clip (dpp->weight_A, dpp->delta, dpp->samples_A [0], left);
+                        update_weight_clip (dpp->weight_B, dpp->delta, dpp->samples_B [0], right);
+                        dpp->samples_B [0] = (left += dpp->aweight_A);
+                        dpp->samples_A [0] = (right += dpp->aweight_B);
                     }
 
-                    code -= apply_weight (dpp->weight_A, sam);
-                    update_weight (dpp->weight_A, dpp->delta, sam, code);
-                }
+                if (flags & JOINT_STEREO)
+                    left += (right -= (left >> 1));
 
+                wps->dc.error [0] += left;
+                wps->dc.error [1] += right;
                 m = (m + 1) & (MAX_TERM - 1);
-                *bptr++ = code;
-            }
 
-        send_words_lossless (wps, buffer, sample_count);
-    }
+                if ((crc += (crc << 3) + (left << 1) + left + right) != crc2)
+                    lossy = TRUE;
 
-    //////////////////// handle the lossless stereo mode //////////////////////
+                if (wpc->config.flags & CONFIG_CALC_NOISE) {
+                    noise = (double)(left - bptr [-2]) * (left - bptr [-2]);
+                    noise += (double)(right - bptr [-1]) * (right - bptr [-1]);
 
-    else if (!(flags & HYBRID_FLAG) && !(flags & MONO_DATA)) {
-        int32_t *eptr = buffer + (sample_count * 2);
+                    noise_acc += noise /= 2.0;
+                    wps->dc.noise_ave = (wps->dc.noise_ave * 0.99) + (noise * 0.01);
 
-        if (!wps->num_passes) {
-            if (flags & JOINT_STEREO)
-                for (bptr = buffer; bptr < eptr; bptr += 2)
-                    bptr [1] += ((bptr [0] -= bptr [1]) >> 1);
-
-            for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount-- ; dpp++)
-                if (((flags & MAG_MASK) >> MAG_LSB) >= 16 || dpp->delta != 2)
-                    decorr_stereo_pass (dpp, buffer, sample_count);
-                else
-                    decorr_stereo_pass_id2 (dpp, buffer, sample_count);
-        }
-
-        send_words_lossless (wps, buffer, sample_count);
-    }
-
-    /////////////////// handle the lossy/hybrid mono mode /////////////////////
-
-    else if ((flags & HYBRID_FLAG) && (flags & MONO_DATA))
-        for (bptr = buffer, i = 0; i < sample_count; ++i) {
-            int32_t code, temp;
-            int shaping_weight;
-
-            crc2 += (crc2 << 1) + (code = *bptr++);
-
-            if (flags & HYBRID_SHAPE) {
-                if (shaping_array)
-                    shaping_weight = *shaping_array++;
-                else
-                    shaping_weight = (wps->dc.shaping_acc [0] += wps->dc.shaping_delta [0]) >> 16;
-
-                temp = -apply_weight (shaping_weight, wps->dc.error [0]);
-
-                if ((flags & NEW_SHAPING) && shaping_weight < 0 && temp) {
-                    if (temp == wps->dc.error [0])
-                        temp = (temp < 0) ? temp + 1 : temp - 1;
-
-                    wps->dc.error [0] = -code;
-                    code += temp;
-                }
-                else
-                    wps->dc.error [0] = -(code += temp);
-            }
-
-            for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount-- ; dpp++)
-                if (dpp->term > MAX_TERM) {
-                    if (dpp->term & 1)
-                        dpp->samples_A [2] = 2 * dpp->samples_A [0] - dpp->samples_A [1];
-                    else
-                        dpp->samples_A [2] = (3 * dpp->samples_A [0] - dpp->samples_A [1]) >> 1;
-
-                    code -= (dpp->aweight_A = apply_weight (dpp->weight_A, dpp->samples_A [2]));
-                }
-                else
-                    code -= (dpp->aweight_A = apply_weight (dpp->weight_A, dpp->samples_A [m]));
-
-            code = send_word (wps, code, 0);
-
-            while (--dpp >= wps->decorr_passes) {
-                if (dpp->term > MAX_TERM) {
-                    update_weight (dpp->weight_A, dpp->delta, dpp->samples_A [2], code);
-                    dpp->samples_A [1] = dpp->samples_A [0];
-                    dpp->samples_A [0] = (code += dpp->aweight_A);
-                }
-                else {
-                    int32_t sam = dpp->samples_A [m];
-
-                    update_weight (dpp->weight_A, dpp->delta, sam, code);
-                    dpp->samples_A [(m + dpp->term) & (MAX_TERM - 1)] = (code += dpp->aweight_A);
+                    if (wps->dc.noise_ave > wps->dc.noise_max)
+                        wps->dc.noise_max = wps->dc.noise_ave;
                 }
             }
 
-            wps->dc.error [0] += code;
-            m = (m + 1) & (MAX_TERM - 1);
+        if (m)
+            for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount--; dpp++)
+                if (dpp->term > 0 && dpp->term <= MAX_TERM) {
+                    int32_t temp_A [MAX_TERM], temp_B [MAX_TERM];
+                    int k;
 
-            if ((crc += (crc << 1) + code) != crc2)
-                lossy = TRUE;
+                    memcpy (temp_A, dpp->samples_A, sizeof (dpp->samples_A));
+                    memcpy (temp_B, dpp->samples_B, sizeof (dpp->samples_B));
 
-            if (wpc->config.flags & CONFIG_CALC_NOISE) {
-                noise = code - bptr [-1];
-
-                noise_acc += noise *= noise;
-                wps->dc.noise_ave = (wps->dc.noise_ave * 0.99) + (noise * 0.01);
-
-                if (wps->dc.noise_ave > wps->dc.noise_max)
-                    wps->dc.noise_max = wps->dc.noise_ave;
-            }
-        }
-
-    /////////////////// handle the lossy/hybrid stereo mode ///////////////////
-
-    else if ((flags & HYBRID_FLAG) && !(flags & MONO_DATA))
-        for (bptr = buffer, i = 0; i < sample_count; ++i) {
-            int32_t left, right, temp;
-            int shaping_weight;
-
-            left = *bptr++;
-            crc2 += (crc2 << 3) + (left << 1) + left + (right = *bptr++);
-
-            if (flags & HYBRID_SHAPE) {
-                if (shaping_array)
-                    shaping_weight = *shaping_array++;
-                else
-                    shaping_weight = (wps->dc.shaping_acc [0] += wps->dc.shaping_delta [0]) >> 16;
-
-                temp = -apply_weight (shaping_weight, wps->dc.error [0]);
-
-                if ((flags & NEW_SHAPING) && shaping_weight < 0 && temp) {
-                    if (temp == wps->dc.error [0])
-                        temp = (temp < 0) ? temp + 1 : temp - 1;
-
-                    wps->dc.error [0] = -left;
-                    left += temp;
-                }
-                else
-                    wps->dc.error [0] = -(left += temp);
-
-                if (!shaping_array)
-                    shaping_weight = (wps->dc.shaping_acc [1] += wps->dc.shaping_delta [1]) >> 16;
-
-                temp = -apply_weight (shaping_weight, wps->dc.error [1]);
-
-                if ((flags & NEW_SHAPING) && shaping_weight < 0 && temp) {
-                    if (temp == wps->dc.error [1])
-                        temp = (temp < 0) ? temp + 1 : temp - 1;
-
-                    wps->dc.error [1] = -right;
-                    right += temp;
-                }
-                else
-                    wps->dc.error [1] = -(right += temp);
-            }
-
-            if (flags & JOINT_STEREO)
-                right += ((left -= right) >> 1);
-
-            for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount-- ; dpp++)
-                if (dpp->term > MAX_TERM) {
-                    if (dpp->term & 1) {
-                        dpp->samples_A [2] = 2 * dpp->samples_A [0] - dpp->samples_A [1];
-                        dpp->samples_B [2] = 2 * dpp->samples_B [0] - dpp->samples_B [1];
+                    for (k = 0; k < MAX_TERM; k++) {
+                        dpp->samples_A [k] = temp_A [m];
+                        dpp->samples_B [k] = temp_B [m];
+                        m = (m + 1) & (MAX_TERM - 1);
                     }
-                    else {
-                        dpp->samples_A [2] = (3 * dpp->samples_A [0] - dpp->samples_A [1]) >> 1;
-                        dpp->samples_B [2] = (3 * dpp->samples_B [0] - dpp->samples_B [1]) >> 1;
-                    }
-
-                    left -= (dpp->aweight_A = apply_weight (dpp->weight_A, dpp->samples_A [2]));
-                    right -= (dpp->aweight_B = apply_weight (dpp->weight_B, dpp->samples_B [2]));
-                }
-                else if (dpp->term > 0) {
-                    left -= (dpp->aweight_A = apply_weight (dpp->weight_A, dpp->samples_A [m]));
-                    right -= (dpp->aweight_B = apply_weight (dpp->weight_B, dpp->samples_B [m]));
-                }
-                else {
-                    if (dpp->term == -1)
-                        dpp->samples_B [0] = left;
-                    else if (dpp->term == -2)
-                        dpp->samples_A [0] = right;
-
-                    left -= (dpp->aweight_A = apply_weight (dpp->weight_A, dpp->samples_A [0]));
-                    right -= (dpp->aweight_B = apply_weight (dpp->weight_B, dpp->samples_B [0]));
                 }
 
-            left = send_word (wps, left, 0);
-            right = send_word (wps, right, 1);
+        if (wpc->config.flags & CONFIG_CALC_NOISE)
+            wps->dc.noise_sum += noise_acc;
 
-            while (--dpp >= wps->decorr_passes)
-                if (dpp->term > MAX_TERM) {
-                    update_weight (dpp->weight_A, dpp->delta, dpp->samples_A [2], left);
-                    update_weight (dpp->weight_B, dpp->delta, dpp->samples_B [2], right);
+        flush_word (wps);
+        data_count = bs_close_write (&wps->wvbits);
 
-                    dpp->samples_A [1] = dpp->samples_A [0];
-                    dpp->samples_B [1] = dpp->samples_B [0];
-
-                    dpp->samples_A [0] = (left += dpp->aweight_A);
-                    dpp->samples_B [0] = (right += dpp->aweight_B);
-                }
-                else if (dpp->term > 0) {
-                    int k = (m + dpp->term) & (MAX_TERM - 1);
-
-                    update_weight (dpp->weight_A, dpp->delta, dpp->samples_A [m], left);
-                    dpp->samples_A [k] = (left += dpp->aweight_A);
-
-                    update_weight (dpp->weight_B, dpp->delta, dpp->samples_B [m], right);
-                    dpp->samples_B [k] = (right += dpp->aweight_B);
-                }
-                else {
-                    if (dpp->term == -1) {
-                        dpp->samples_B [0] = left + dpp->aweight_A;
-                        dpp->aweight_B = apply_weight (dpp->weight_B, dpp->samples_B [0]);
-                    }
-                    else if (dpp->term == -2) {
-                        dpp->samples_A [0] = right + dpp->aweight_B;
-                        dpp->aweight_A = apply_weight (dpp->weight_A, dpp->samples_A [0]);
-                    }
-
-                    update_weight_clip (dpp->weight_A, dpp->delta, dpp->samples_A [0], left);
-                    update_weight_clip (dpp->weight_B, dpp->delta, dpp->samples_B [0], right);
-                    dpp->samples_B [0] = (left += dpp->aweight_A);
-                    dpp->samples_A [0] = (right += dpp->aweight_B);
-                }
-
-            if (flags & JOINT_STEREO)
-                left += (right -= (left >> 1));
-
-            wps->dc.error [0] += left;
-            wps->dc.error [1] += right;
-            m = (m + 1) & (MAX_TERM - 1);
-
-            if ((crc += (crc << 3) + (left << 1) + left + right) != crc2)
-                lossy = TRUE;
-
-            if (wpc->config.flags & CONFIG_CALC_NOISE) {
-                noise = (double)(left - bptr [-2]) * (left - bptr [-2]);
-                noise += (double)(right - bptr [-1]) * (right - bptr [-1]);
-
-                noise_acc += noise /= 2.0;
-                wps->dc.noise_ave = (wps->dc.noise_ave * 0.99) + (noise * 0.01);
-
-                if (wps->dc.noise_ave > wps->dc.noise_max)
-                    wps->dc.noise_max = wps->dc.noise_ave;
-            }
-        }
-
-    if (m)
-        for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount--; dpp++)
-            if (dpp->term > 0 && dpp->term <= MAX_TERM) {
-                int32_t temp_A [MAX_TERM], temp_B [MAX_TERM];
-                int k;
-
-                memcpy (temp_A, dpp->samples_A, sizeof (dpp->samples_A));
-                memcpy (temp_B, dpp->samples_B, sizeof (dpp->samples_B));
-
-                for (k = 0; k < MAX_TERM; k++) {
-                    dpp->samples_A [k] = temp_A [m];
-                    dpp->samples_B [k] = temp_B [m];
-                    m = (m + 1) & (MAX_TERM - 1);
-                }
-            }
-
-    if (wpc->config.flags & CONFIG_CALC_NOISE)
-        wps->dc.noise_sum += noise_acc;
-
-    flush_word (wps);
-    data_count = bs_close_write (&wps->wvbits);
-
-    if (data_count) {
-        if (data_count != (uint32_t) -1) {
-            unsigned char *cptr = wps->blockbuff + ((WavpackHeader *) wps->blockbuff)->ckSize + 8;
-
-            *cptr++ = ID_WV_BITSTREAM | ID_LARGE;
-            *cptr++ = data_count >> 1;
-            *cptr++ = data_count >> 9;
-            *cptr++ = data_count >> 17;
-            ((WavpackHeader *) wps->blockbuff)->ckSize += data_count + 4;
-        }
-        else
-            return FALSE;
-    }
-
-    ((WavpackHeader *) wps->blockbuff)->crc = crc;
-
-    if (wpc->wvc_flag) {
-        data_count = bs_close_write (&wps->wvcbits);
-
-        if (data_count && lossy) {
+        if (data_count) {
             if (data_count != (uint32_t) -1) {
-                unsigned char *cptr = wps->block2buff + ((WavpackHeader *) wps->block2buff)->ckSize + 8;
+                unsigned char *cptr = wps->blockbuff + ((WavpackHeader *) wps->blockbuff)->ckSize + 8;
 
-                *cptr++ = ID_WVC_BITSTREAM | ID_LARGE;
+                *cptr++ = ID_WV_BITSTREAM | ID_LARGE;
                 *cptr++ = data_count >> 1;
                 *cptr++ = data_count >> 9;
                 *cptr++ = data_count >> 17;
-                ((WavpackHeader *) wps->block2buff)->ckSize += data_count + 4;
+                ((WavpackHeader *) wps->blockbuff)->ckSize += data_count + 4;
             }
             else
                 return FALSE;
         }
 
-        ((WavpackHeader *) wps->block2buff)->crc = crc2;
-    }
-    else if (lossy)
-        wpc->lossy_blocks = TRUE;
+        ((WavpackHeader *) wps->blockbuff)->crc = crc;
+
+        if (wpc->wvc_flag) {
+            data_count = bs_close_write (&wps->wvcbits);
+
+            if (data_count && lossy) {
+                if (data_count != (uint32_t) -1) {
+                    unsigned char *cptr = wps->block2buff + ((WavpackHeader *) wps->block2buff)->ckSize + 8;
+
+                    *cptr++ = ID_WVC_BITSTREAM | ID_LARGE;
+                    *cptr++ = data_count >> 1;
+                    *cptr++ = data_count >> 9;
+                    *cptr++ = data_count >> 17;
+                    ((WavpackHeader *) wps->block2buff)->ckSize += data_count + 4;
+                }
+                else
+                    return FALSE;
+            }
+
+            ((WavpackHeader *) wps->block2buff)->crc = crc2;
+        }
+        else if (lossy)
+            wpc->lossy_blocks = TRUE;
+
+        // we're done with the entire block, so now we check if our threshold for a "repack" was hit
+
+        if (repack_possible && wps->num_terms > REPACK_SAFE_NUM_TERMS && (max_magnitude & repack_mask)) {
+            *wps = saved_stream;
+            wps->num_terms = REPACK_SAFE_NUM_TERMS;
+            memcpy (wps->blockbuff, &wps->wphdr, sizeof (WavpackHeader));
+
+            if (saved_buffer)
+                memcpy (buffer, saved_buffer, sample_count * sizeof (int32_t) * (flags & MONO_DATA ? 1 : 2));
+
+            if (flags & HYBRID_FLAG)
+                crc = crc2 = 0xffffffff;
+        }
+        else {
+            // if we actually did repack the block with fewer terms, we detect that here
+            // and clean up so that we return to the original term count; in either case we
+            // free the saved_buffer (if allocated) and break out of the loop
+            if (wps->num_terms != saved_stream.num_terms) {
+                int ti;
+
+                for (ti = wps->num_terms; ti < saved_stream.num_terms; ++ti) {
+                    wps->decorr_passes [ti].weight_A = wps->decorr_passes [ti].weight_B = 0;
+                    CLEAR (wps->decorr_passes [ti].samples_A);
+                    CLEAR (wps->decorr_passes [ti].samples_B);
+                }
+
+                wps->num_terms = saved_stream.num_terms;
+            }
+
+            if (saved_buffer)
+                free (saved_buffer);
+
+            break;
+        }
+
+    } while (1);
 
     wps->sample_index += sample_count;
     return TRUE;
 }
 
-// Perform a pass of the stereo decorrelation as specified by the referenced
-// dpp structure. This version is optimized for samples that can use the
-// simple apply_weight macro (i.e. <= 16-bit audio) and for when the weight
-// delta is 2 (which is the case with all the default, non -x modes). For
-// cases that do not fit this model, the more general decorr_stereo_pass()
-// is provided. Note that this function returns the dpp->samples_X[] values
-// in the "normalized" positions for terms 1-8.
+#if !defined(OPT_ASM_X64)
 
-static void decorr_stereo_pass_id2 (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count)
-{
-    int32_t *bptr, *eptr = buffer + (sample_count * 2);
-    int m, k;
+// This is the "C" version of the stereo decorrelation pass function. There
+// are assembly optimized versions of this that can be used if available.
+// It performs a single pass of stereo decorrelation, in place, as specified
+// by the decorr_pass structure. Note that this function does NOT return the
+// dpp->samples_X[] values in the "normalized" positions for terms 1-8, so if
+// the number of samples is not a multiple of MAX_TERM, these must be moved if
+// they are to be used somewhere else.
 
-    switch (dpp->term) {
-        case 17:
-            for (bptr = buffer; bptr < eptr; bptr += 2) {
-                int32_t sam, tmp;
-
-                sam = 2 * dpp->samples_A [0] - dpp->samples_A [1];
-                dpp->samples_A [1] = dpp->samples_A [0];
-                bptr [0] = tmp = (dpp->samples_A [0] = bptr [0]) - apply_weight_i (dpp->weight_A, sam);
-                update_weight_d2 (dpp->weight_A, dpp->delta, sam, tmp);
-
-                sam = 2 * dpp->samples_B [0] - dpp->samples_B [1];
-                dpp->samples_B [1] = dpp->samples_B [0];
-                bptr [1] = tmp = (dpp->samples_B [0] = bptr [1]) - apply_weight_i (dpp->weight_B, sam);
-                update_weight_d2 (dpp->weight_B, dpp->delta, sam, tmp);
-            }
-
-            break;
-
-        case 18:
-            for (bptr = buffer; bptr < eptr; bptr += 2) {
-                int32_t sam, tmp;
-
-                sam = dpp->samples_A [0] + ((dpp->samples_A [0] - dpp->samples_A [1]) >> 1);
-                dpp->samples_A [1] = dpp->samples_A [0];
-                bptr [0] = tmp = (dpp->samples_A [0] = bptr [0]) - apply_weight_i (dpp->weight_A, sam);
-                update_weight_d2 (dpp->weight_A, dpp->delta, sam, tmp);
-
-                sam = dpp->samples_B [0] + ((dpp->samples_B [0] - dpp->samples_B [1]) >> 1);
-                dpp->samples_B [1] = dpp->samples_B [0];
-                bptr [1] = tmp = (dpp->samples_B [0] = bptr [1]) - apply_weight_i (dpp->weight_B, sam);
-                update_weight_d2 (dpp->weight_B, dpp->delta, sam, tmp);
-            }
-
-            break;
-
-        default:
-            for (m = 0, k = dpp->term & (MAX_TERM - 1), bptr = buffer; bptr < eptr; bptr += 2) {
-                int32_t sam, tmp;
-
-                sam = dpp->samples_A [m];
-                bptr [0] = tmp = (dpp->samples_A [k] = bptr [0]) - apply_weight_i (dpp->weight_A, sam);
-                update_weight_d2 (dpp->weight_A, dpp->delta, sam, tmp);
-
-                sam = dpp->samples_B [m];
-                bptr [1] = tmp = (dpp->samples_B [k] = bptr [1]) - apply_weight_i (dpp->weight_B, sam);
-                update_weight_d2 (dpp->weight_B, dpp->delta, sam, tmp);
-
-                m = (m + 1) & (MAX_TERM - 1);
-                k = (k + 1) & (MAX_TERM - 1);
-            }
-
-            if (m) {
-                int32_t temp_A [MAX_TERM], temp_B [MAX_TERM];
-
-                memcpy (temp_A, dpp->samples_A, sizeof (dpp->samples_A));
-                memcpy (temp_B, dpp->samples_B, sizeof (dpp->samples_B));
-
-                for (k = 0; k < MAX_TERM; k++) {
-                    dpp->samples_A [k] = temp_A [m];
-                    dpp->samples_B [k] = temp_B [m];
-                    m = (m + 1) & (MAX_TERM - 1);
-                }
-            }
-
-            break;
-
-        case -1:
-            for (bptr = buffer; bptr < eptr; bptr += 2) {
-                int32_t sam_A, sam_B, tmp;
-
-                sam_A = dpp->samples_A [0];
-                bptr [0] = tmp = (sam_B = bptr [0]) - apply_weight_i (dpp->weight_A, sam_A);
-                update_weight_clip_d2 (dpp->weight_A, dpp->delta, sam_A, tmp);
-
-                bptr [1] = tmp = (dpp->samples_A [0] = bptr [1]) - apply_weight_i (dpp->weight_B, sam_B);
-                update_weight_clip_d2 (dpp->weight_B, dpp->delta, sam_B, tmp);
-            }
-
-            break;
-
-        case -2:
-            for (bptr = buffer; bptr < eptr; bptr += 2) {
-                int32_t sam_A, sam_B, tmp;
-
-                sam_B = dpp->samples_B [0];
-                bptr [1] = tmp = (sam_A = bptr [1]) - apply_weight_i (dpp->weight_B, sam_B);
-                update_weight_clip_d2 (dpp->weight_B, dpp->delta, sam_B, tmp);
-
-                bptr [0] = tmp = (dpp->samples_B [0] = bptr [0]) - apply_weight_i (dpp->weight_A, sam_A);
-                update_weight_clip_d2 (dpp->weight_A, dpp->delta, sam_A, tmp);
-            }
-
-            break;
-
-        case -3:
-            for (bptr = buffer; bptr < eptr; bptr += 2) {
-                int32_t sam_A, sam_B, tmp;
-
-                sam_A = dpp->samples_A [0];
-                sam_B = dpp->samples_B [0];
-
-                dpp->samples_A [0] = tmp = bptr [1];
-                bptr [1] = tmp -= apply_weight_i (dpp->weight_B, sam_B);
-                update_weight_clip_d2 (dpp->weight_B, dpp->delta, sam_B, tmp);
-
-                dpp->samples_B [0] = tmp = bptr [0];
-                bptr [0] = tmp -= apply_weight_i (dpp->weight_A, sam_A);
-                update_weight_clip_d2 (dpp->weight_A, dpp->delta, sam_A, tmp);
-            }
-
-            break;
-    }
-}
-
-// Perform a pass of the stereo decorrelation as specified by the referenced
-// dpp structure. This function is provided in both a regular C version and
-// an MMX version (using intrinsics) written by Joachim Henke. The MMX version
-// is significantly faster when the sample data requires the full-resolution
-// apply_weight macro. However, when the data is lower resolution (<= 16-bit)
-// then the difference is slight (or the MMX is even slower), so for these
-// cases the simpler decorr_stereo_pass_id2() is used. Note that this function
-// returns the dpp->samples_X[] values in the "normalized" positions for
-// terms 1-8.
-
-#ifdef OPT_MMX
-
-static void decorr_stereo_pass (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count)
-{
-    const __m64
-        delta = _mm_set1_pi32 (dpp->delta),
-        fill = _mm_set1_pi32 (0x7bff),
-        mask = _mm_set1_pi32 (0x7fff),
-        round = _mm_set1_pi32 (512),
-        zero = _mm_set1_pi32 (0);
-    __m64
-        weight_AB = _mm_set_pi32 (restore_weight (store_weight (dpp->weight_B)), restore_weight (store_weight (dpp->weight_A))),
-        left_right, sam_AB, tmp0, tmp1, samples_AB [MAX_TERM];
-    int k, m = 0;
-
-    for (k = 0; k < MAX_TERM; ++k) {
-        ((int32_t *) samples_AB) [k * 2] = exp2s (log2s (dpp->samples_A [k]));
-        ((int32_t *) samples_AB) [k * 2 + 1] = exp2s (log2s (dpp->samples_B [k]));
-    }
-
-    if (dpp->term > 0) {
-        if (dpp->term == 17) {
-            while (sample_count--) {
-                left_right = *(__m64 *) buffer;
-                tmp0 = samples_AB [0];
-                sam_AB = _m_paddd (tmp0, tmp0);
-                sam_AB = _m_psubd (sam_AB, samples_AB [1]);
-                samples_AB [0] = left_right;
-                samples_AB [1] = tmp0;
-
-                tmp0 = _m_paddd (sam_AB, sam_AB);
-                tmp1 = _m_pand (sam_AB, mask);
-                tmp0 = _m_psrldi (tmp0, 16);
-                tmp1 = _m_pmaddwd (tmp1, weight_AB);
-                tmp0 = _m_pmaddwd (tmp0, weight_AB);
-                tmp1 = _m_paddd (tmp1, round);
-                tmp0 = _m_pslldi (tmp0, 5);
-                tmp1 = _m_psradi (tmp1, 10);
-                left_right = _m_psubd (left_right, tmp0);
-                left_right = _m_psubd (left_right, tmp1);
-
-                *(__m64 *) buffer = left_right;
-
-                tmp0 = _m_pxor (sam_AB, left_right);
-                tmp0 = _m_psradi (tmp0, 31);
-                tmp1 = _m_pxor (delta, tmp0);
-                tmp1 = _m_psubd (tmp1, tmp0);
-                sam_AB = _m_pcmpeqd (sam_AB, zero);
-                tmp0 = _m_pcmpeqd (left_right, zero);
-                tmp0 = _m_por (tmp0, sam_AB);
-                tmp0 = _m_pandn (tmp0, tmp1);
-                weight_AB = _m_paddd (weight_AB, tmp0);
-
-                buffer += 2;
-            }
-        }
-        else if (dpp->term == 18) {
-            while (sample_count--) {
-                left_right = *(__m64 *) buffer;
-                tmp0 = samples_AB [0];
-                sam_AB = _m_psubd (tmp0, samples_AB [1]);
-                sam_AB = _m_psradi (sam_AB, 1);
-                sam_AB = _m_paddd (sam_AB, tmp0);
-                samples_AB [0] = left_right;
-                samples_AB [1] = tmp0;
-
-                tmp0 = _m_paddd (sam_AB, sam_AB);
-                tmp1 = _m_pand (sam_AB, mask);
-                tmp0 = _m_psrldi (tmp0, 16);
-                tmp1 = _m_pmaddwd (tmp1, weight_AB);
-                tmp0 = _m_pmaddwd (tmp0, weight_AB);
-                tmp1 = _m_paddd (tmp1, round);
-                tmp0 = _m_pslldi (tmp0, 5);
-                tmp1 = _m_psradi (tmp1, 10);
-                left_right = _m_psubd (left_right, tmp0);
-                left_right = _m_psubd (left_right, tmp1);
-
-                *(__m64 *) buffer = left_right;
-
-                tmp0 = _m_pxor (sam_AB, left_right);
-                tmp0 = _m_psradi (tmp0, 31);
-                tmp1 = _m_pxor (delta, tmp0);
-                tmp1 = _m_psubd (tmp1, tmp0);
-                sam_AB = _m_pcmpeqd (sam_AB, zero);
-                tmp0 = _m_pcmpeqd (left_right, zero);
-                tmp0 = _m_por (tmp0, sam_AB);
-                tmp0 = _m_pandn (tmp0, tmp1);
-                weight_AB = _m_paddd (weight_AB, tmp0);
-
-                buffer += 2;
-            }
-        }
-        else {
-            k = dpp->term & (MAX_TERM - 1);
-            while (sample_count--) {
-                left_right = *(__m64 *) buffer;
-                sam_AB = samples_AB [m];
-                samples_AB [k] = left_right;
-
-                tmp0 = _m_paddd (sam_AB, sam_AB);
-                tmp1 = _m_pand (sam_AB, mask);
-                tmp0 = _m_psrldi (tmp0, 16);
-                tmp1 = _m_pmaddwd (tmp1, weight_AB);
-                tmp0 = _m_pmaddwd (tmp0, weight_AB);
-                tmp1 = _m_paddd (tmp1, round);
-                tmp0 = _m_pslldi (tmp0, 5);
-                tmp1 = _m_psradi (tmp1, 10);
-                left_right = _m_psubd (left_right, tmp0);
-                left_right = _m_psubd (left_right, tmp1);
-
-                *(__m64 *) buffer = left_right;
-
-                tmp0 = _m_pxor (sam_AB, left_right);
-                tmp0 = _m_psradi (tmp0, 31);
-                tmp1 = _m_pxor (delta, tmp0);
-                tmp1 = _m_psubd (tmp1, tmp0);
-                sam_AB = _m_pcmpeqd (sam_AB, zero);
-                tmp0 = _m_pcmpeqd (left_right, zero);
-                tmp0 = _m_por (tmp0, sam_AB);
-                tmp0 = _m_pandn (tmp0, tmp1);
-                weight_AB = _m_paddd (weight_AB, tmp0);
-
-                buffer += 2;
-                k = (k + 1) & (MAX_TERM - 1);
-                m = (m + 1) & (MAX_TERM - 1);
-            }
-        }
-    }
-    else {
-        if (dpp->term == -1) {
-            while (sample_count--) {
-                left_right = *(__m64 *) buffer;
-                sam_AB = samples_AB [0];
-                samples_AB [0] = _m_punpckhdq (left_right, sam_AB);
-                sam_AB = _m_punpckldq (sam_AB, left_right);
-
-                tmp0 = _m_paddd (sam_AB, sam_AB);
-                tmp1 = _m_pand (sam_AB, mask);
-                tmp0 = _m_psrldi (tmp0, 16);
-                tmp1 = _m_pmaddwd (tmp1, weight_AB);
-                tmp0 = _m_pmaddwd (tmp0, weight_AB);
-                tmp1 = _m_paddd (tmp1, round);
-                tmp0 = _m_pslldi (tmp0, 5);
-                tmp1 = _m_psradi (tmp1, 10);
-                left_right = _m_psubd (left_right, tmp0);
-                left_right = _m_psubd (left_right, tmp1);
-
-                *(__m64 *) buffer = left_right;
-
-                tmp0 = _m_pcmpeqd (sam_AB, zero);
-                tmp1 = _m_pcmpeqd (left_right, zero);
-                tmp0 = _m_por (tmp0, tmp1);
-                tmp0 = _m_pandn (tmp0, delta);
-                sam_AB = _m_pxor (sam_AB, left_right);
-                sam_AB = _m_psradi (sam_AB, 31);
-                tmp1 = _m_psubd (fill, sam_AB);
-                weight_AB = _m_pxor (weight_AB, sam_AB);
-                weight_AB = _m_paddd (weight_AB, tmp1);
-                weight_AB = _m_paddsw (weight_AB, tmp0);
-                weight_AB = _m_psubd (weight_AB, tmp1);
-                weight_AB = _m_pxor (weight_AB, sam_AB);
-
-                buffer += 2;
-            }
-        }
-        else if (dpp->term == -2) {
-            while (sample_count--) {
-                left_right = *(__m64 *) buffer;
-                sam_AB = samples_AB [0];
-                samples_AB [0] = _m_punpckldq (sam_AB, left_right);
-                sam_AB = _m_punpckhdq (left_right, sam_AB);
-
-                tmp0 = _m_paddd (sam_AB, sam_AB);
-                tmp1 = _m_pand (sam_AB, mask);
-                tmp0 = _m_psrldi (tmp0, 16);
-                tmp1 = _m_pmaddwd (tmp1, weight_AB);
-                tmp0 = _m_pmaddwd (tmp0, weight_AB);
-                tmp1 = _m_paddd (tmp1, round);
-                tmp0 = _m_pslldi (tmp0, 5);
-                tmp1 = _m_psradi (tmp1, 10);
-                left_right = _m_psubd (left_right, tmp0);
-                left_right = _m_psubd (left_right, tmp1);
-
-                *(__m64 *) buffer = left_right;
-
-                tmp0 = _m_pcmpeqd (sam_AB, zero);
-                tmp1 = _m_pcmpeqd (left_right, zero);
-                tmp0 = _m_por (tmp0, tmp1);
-                tmp0 = _m_pandn (tmp0, delta);
-                sam_AB = _m_pxor (sam_AB, left_right);
-                sam_AB = _m_psradi (sam_AB, 31);
-                tmp1 = _m_psubd (fill, sam_AB);
-                weight_AB = _m_pxor (weight_AB, sam_AB);
-                weight_AB = _m_paddd (weight_AB, tmp1);
-                weight_AB = _m_paddsw (weight_AB, tmp0);
-                weight_AB = _m_psubd (weight_AB, tmp1);
-                weight_AB = _m_pxor (weight_AB, sam_AB);
-
-                buffer += 2;
-            }
-        }
-        else if (dpp->term == -3) {
-            while (sample_count--) {
-                left_right = *(__m64 *) buffer;
-                sam_AB = samples_AB [0];
-                tmp0 = _m_punpckhdq (left_right, left_right);
-                samples_AB [0] = _m_punpckldq (tmp0, left_right);
-
-                tmp0 = _m_paddd (sam_AB, sam_AB);
-                tmp1 = _m_pand (sam_AB, mask);
-                tmp0 = _m_psrldi (tmp0, 16);
-                tmp1 = _m_pmaddwd (tmp1, weight_AB);
-                tmp0 = _m_pmaddwd (tmp0, weight_AB);
-                tmp1 = _m_paddd (tmp1, round);
-                tmp0 = _m_pslldi (tmp0, 5);
-                tmp1 = _m_psradi (tmp1, 10);
-                left_right = _m_psubd (left_right, tmp0);
-                left_right = _m_psubd (left_right, tmp1);
-
-                *(__m64 *) buffer = left_right;
-
-                tmp0 = _m_pcmpeqd (sam_AB, zero);
-                tmp1 = _m_pcmpeqd (left_right, zero);
-                tmp0 = _m_por (tmp0, tmp1);
-                tmp0 = _m_pandn (tmp0, delta);
-                sam_AB = _m_pxor (sam_AB, left_right);
-                sam_AB = _m_psradi (sam_AB, 31);
-                tmp1 = _m_psubd (fill, sam_AB);
-                weight_AB = _m_pxor (weight_AB, sam_AB);
-                weight_AB = _m_paddd (weight_AB, tmp1);
-                weight_AB = _m_paddsw (weight_AB, tmp0);
-                weight_AB = _m_psubd (weight_AB, tmp1);
-                weight_AB = _m_pxor (weight_AB, sam_AB);
-
-                buffer += 2;
-            }
-        }
-    }
-
-    dpp->weight_A = ((int32_t *) &weight_AB) [0];
-    dpp->weight_B = ((int32_t *) &weight_AB) [1];
-
-    for (k = 0; k < MAX_TERM; ++k) {
-        dpp->samples_A [k] = ((int32_t *) samples_AB) [m * 2];
-        dpp->samples_B [k] = ((int32_t *) samples_AB) [m * 2 + 1];
-        m = (m + 1) & (MAX_TERM - 1);
-    }
-
-    _mm_empty ();
-}
-
-#else
-
-static void decorr_stereo_pass (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count)
+void decorr_stereo_pass (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count)
 {
     int32_t *bptr, *eptr = buffer + (sample_count * 2);
     int m, k;
@@ -2879,19 +1537,6 @@ static void decorr_stereo_pass (struct decorr_pass *dpp, int32_t *buffer, int32_
                 k = (k + 1) & (MAX_TERM - 1);
             }
 
-            if (m) {
-                int32_t temp_A [MAX_TERM], temp_B [MAX_TERM];
-
-                memcpy (temp_A, dpp->samples_A, sizeof (dpp->samples_A));
-                memcpy (temp_B, dpp->samples_B, sizeof (dpp->samples_B));
-
-                for (k = 0; k < MAX_TERM; k++) {
-                    dpp->samples_A [k] = temp_A [m];
-                    dpp->samples_B [k] = temp_B [m];
-                    m = (m + 1) & (MAX_TERM - 1);
-                }
-            }
-
             break;
 
         case -1:
@@ -2942,6 +1587,75 @@ static void decorr_stereo_pass (struct decorr_pass *dpp, int32_t *buffer, int32_
     }
 }
 
+// This is the "C" version of the magnitude scanning function. There are
+// assembly optimized versions of this that can be used if available. This
+// function scans a buffer of signed 32-bit ints and returns the magnitude
+// of the largest sample, with a power-of-two resolution. It might be more
+// useful to return the actual maximum absolute value (and this function
+// could do that without breaking anything), but that implementation would
+// likely be slower. Instead, this simply returns the "or" of all the
+// values "xor"d with their own sign.
+
+uint32_t scan_max_magnitude (int32_t *values, int32_t num_values)
+{
+    uint32_t magnitude = 0;             // running OR of every sample's magnitude bits
+
+    while (num_values--)                // NOTE(review): assumes num_values >= 0 - confirm against callers
+        magnitude |= (*values < 0) ? ~*values++ : *values++;    // negative samples are bit-complemented before being OR'd in
+
+    return magnitude;
+}
+
+#endif
+
+#if !defined(OPT_ASM_X86) && !defined(OPT_ASM_X64)
+
+// This is the "C" version of the mono decorrelation pass function. There
+// are assembly optimized versions of this that can be used if available.
+// It decorrelates a buffer of mono samples, in place, as specified by the array
+// of decorr_pass structures. Note that this function does NOT return the
+// dpp->samples_X[] values in the "normalized" positions for terms 1-8, so if
+// the number of samples is not a multiple of MAX_TERM, these must be moved if
+// they are to be used somewhere else. The magnitude of the output samples is
+// accumulated and returned (see scan_max_magnitude() for more details).
+
+uint32_t decorr_mono_buffer (int32_t *buffer, struct decorr_pass *decorr_passes, int32_t num_terms, int32_t sample_count)
+{
+    uint32_t max_magnitude = 0;
+    struct decorr_pass *dpp;
+    int tcount, i;
+
+    for (i = 0; i < sample_count; ++i) {
+        int32_t code = *buffer;         // current sample; each pass below subtracts its prediction from it
+
+        for (tcount = num_terms, dpp = decorr_passes; tcount--; dpp++) {
+            int32_t sam;
+
+            if (dpp->term > MAX_TERM) { // these terms predict from the two most recent values in samples_A
+                if (dpp->term & 1)
+                    sam = 2 * dpp->samples_A [0] - dpp->samples_A [1];          // odd term: 2 * newest - previous
+                else
+                    sam = (3 * dpp->samples_A [0] - dpp->samples_A [1]) >> 1;   // even term: (3 * newest - previous) / 2
+
+                dpp->samples_A [1] = dpp->samples_A [0];    // shift the two-entry history
+                dpp->samples_A [0] = code;
+            }
+            else {
+                sam = dpp->samples_A [i & (MAX_TERM - 1)];  // history indexed by sample number (assumes MAX_TERM is a power of two - confirm)
+                dpp->samples_A [(i + dpp->term) & (MAX_TERM - 1)] = code;   // stored "term" slots ahead, so positions are not normalized on return
+            }
+
+            code -= apply_weight (dpp->weight_A, sam);      // remove the weighted prediction
+            update_weight (dpp->weight_A, dpp->delta, sam, code);   // macro defined elsewhere; adapts weight_A from sam and the new residual
+        }
+
+        *buffer++ = code;               // the result overwrites the input sample in place
+        max_magnitude |= (code < 0) ? ~code : code;         // same accumulation as scan_max_magnitude()
+    }
+
+    return max_magnitude;
+}
+
 #endif
 
 //////////////////////////////////////////////////////////////////////////////
@@ -2961,44 +1675,52 @@ double WavpackGetEncodedNoise (WavpackContext *wpc, double *peak)
     return wps->dc.noise_sum;
 }
 
-// Given an array of integer data (in shorts), find the linear function that most closely
-// represents it (based on minimum sum of absolute errors). This is returned as the double
-// precision initial & final Y values of the best-fit line. The function can also optionally
-// compute and return a maximum error value (as a short). Note that the ends of the resulting
-// line may fall way outside the range of input values, so some sort of clipping may be
-// needed.
+// Open the specified BitStream using the specified buffer pointers. It is
+// assumed that enough buffer space has been allocated for all data that will
+// be written, otherwise an error will be generated.
 
-void best_floating_line (short *values, int num_values, double *initial_y, double *final_y, short *max_error)
+static void bs_write (Bitstream *bs);
+
+static void bs_open_write (Bitstream *bs, void *buffer_start, void *buffer_end)
 {
-    double left_sum = 0.0, right_sum = 0.0, center_x = (num_values - 1) / 2.0, center_y, m;
-    int i;
-
-    for (i = 0; i < num_values >> 1; ++i) {
-        right_sum += values [num_values - i - 1];
-        left_sum += values [i];
-    }
-
-    if (num_values & 1) {
-        right_sum += values [num_values >> 1] * 0.5;
-        left_sum += values [num_values >> 1] * 0.5;
-    }
-
-    center_y = (right_sum + left_sum) / num_values;
-    m = (right_sum - left_sum) / ((double) num_values * num_values) * 4.0;
-
-    if (initial_y)
-        *initial_y = center_y - m * center_x;
-
-    if (final_y)
-        *final_y = center_y + m * center_x;
-
-    if (max_error) {
-        double max = 0.0;
-
-        for (i = 0; i < num_values; ++i)
-            if (fabs (values [i] - (center_y + (i - center_x) * m)) > max)
-                max = fabs (values [i] - (center_y + (i - center_x) * m));
-
-        *max_error = (short) floor (max + 0.5);
-    }
+    bs->error = bs->sr = bs->bc = 0;    // start with no error recorded and the sr / bc fields zeroed
+    bs->ptr = bs->buf = buffer_start;   // the write position begins at the start of the caller's buffer
+    bs->end = buffer_end;               // end of the space the caller has provided
+    bs->wrap = bs_write;                // buffer-full callback; bs_write() records the overflow as an error
+}
+
+// This function is only called from the putbit() and putbits() macros when
+// the buffer is full, which is now flagged as an error.
+
+static void bs_write (Bitstream *bs)
+{
+    bs->ptr = bs->buf;                  // rewind to the buffer start (presumably so later writes stay inside the buffer)
+    bs->error = 1;                      // checked by bs_close_write(), which then returns (uint32_t) -1
+}
+
+// This function forces a flushing write of the specified BitStream, and
+// returns the total number of bytes written into the buffer.
+
+static uint32_t bs_close_write (Bitstream *bs)
+{
+    uint32_t bytes_written;
+
+    if (bs->error)                      // set by bs_write() when the buffer filled up
+        return (uint32_t) -1;           // sentinel that callers compare against to detect failure
+
+    while (1) {
+        while (bs->bc)                  // keep writing bits until bc reaches zero
+            putbit_1 (bs);              // macro defined elsewhere; presumably appends a single 1 bit
+
+        bytes_written = (uint32_t)(bs->ptr - bs->buf) * sizeof (*(bs->ptr));    // element count scaled to bytes
+
+        if (bytes_written & 1) {        // odd byte count: write one more bit, then flush and measure again
+            putbit_1 (bs);
+        }
+        else
+            break;                      // the loop only exits with an even byte count
+    };
+
+    CLEAR (*bs);                        // macro defined elsewhere; presumably zeroes the whole Bitstream
+    return bytes_written;
+}
diff --git a/third_party/wavpack/src/pack_dns.c b/third_party/wavpack/src/pack_dns.c
new file mode 100644
index 0000000..2c6f3c8
--- /dev/null
+++ b/third_party/wavpack/src/pack_dns.c
@@ -0,0 +1,191 @@
+////////////////////////////////////////////////////////////////////////////
+//                           **** WAVPACK ****                            //
+//                  Hybrid Lossless Wavefile Compressor                   //
+//              Copyright (c) 1998 - 2013 Conifer Software.               //
+//                          All Rights Reserved.                          //
+//      Distributed under the BSD Software License (see license.txt)      //
+////////////////////////////////////////////////////////////////////////////
+
+// pack_dns.c
+
+// This module handles the implementation of "dynamic noise shaping" which is
+// designed to move the spectrum of the quantization noise introduced by lossy
+// compression up or down in frequency so that it is more likely to be masked
+// by the source material.
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "wavpack_local.h"
+
+static void best_floating_line (short *values, int num_values, double *initial_y, double *final_y, short *max_error);
+
+void dynamic_noise_shaping (WavpackContext *wpc, int32_t *buffer, int shortening_allowed)
+{
+    WavpackStream *wps = wpc->streams [wpc->current_stream];
+    int32_t sample_count = wps->wphdr.block_samples;
+    struct decorr_pass *ap = &wps->analysis_pass;
+    uint32_t flags = wps->wphdr.flags;
+    int32_t *bptr, temp, sam;
+    short *swptr;
+    int sc;
+
+    if (!wps->num_terms && sample_count > 8) {      // no terms set and more than 8 samples: adapt the analysis weights first
+        if (flags & MONO_DATA)
+            for (bptr = buffer + sample_count - 3, sc = sample_count - 2; sc--;) {     // walks the block backwards; buffer is only read
+                sam = (3 * bptr [1] - bptr [2]) >> 1;
+                temp = *bptr-- - apply_weight (ap->weight_A, sam);
+                update_weight (ap->weight_A, 2, sam, temp);
+            }
+        else
+            for (bptr = buffer + (sample_count - 3) * 2 + 1, sc = sample_count - 2; sc--;) {   // interleaved data: B value first, then A
+                sam = (3 * bptr [2] - bptr [4]) >> 1;
+                temp = *bptr-- - apply_weight (ap->weight_B, sam);
+                update_weight (ap->weight_B, 2, sam, temp);
+                sam = (3 * bptr [2] - bptr [4]) >> 1;
+                temp = *bptr-- - apply_weight (ap->weight_A, sam);
+                update_weight (ap->weight_A, 2, sam, temp);
+            }
+    }
+
+    if (sample_count > wps->dc.shaping_samples) {   // only samples past the first shaping_samples are analyzed here
+        sc = sample_count - wps->dc.shaping_samples;
+        swptr = wps->dc.shaping_data + wps->dc.shaping_samples;
+        bptr = buffer + wps->dc.shaping_samples * ((flags & MONO_DATA) ? 1 : 2);
+
+        if (flags & MONO_DATA)
+            while (sc--) {
+                sam = (3 * ap->samples_A [0] - ap->samples_A [1]) >> 1;
+                temp = *bptr - apply_weight (ap->weight_A, sam);
+                update_weight (ap->weight_A, 2, sam, temp);
+                ap->samples_A [1] = ap->samples_A [0];
+                ap->samples_A [0] = *bptr++;
+                *swptr++ = (ap->weight_A < 256) ? 1024 : 1536 - ap->weight_A * 2;      // one shaping value per sample, derived from the weight
+            }
+        else
+            while (sc--) {
+                sam = (3 * ap->samples_A [0] - ap->samples_A [1]) >> 1;
+                temp = *bptr - apply_weight (ap->weight_A, sam);
+                update_weight (ap->weight_A, 2, sam, temp);
+                ap->samples_A [1] = ap->samples_A [0];
+                ap->samples_A [0] = *bptr++;
+
+                sam = (3 * ap->samples_B [0] - ap->samples_B [1]) >> 1;
+                temp = *bptr - apply_weight (ap->weight_B, sam);
+                update_weight (ap->weight_B, 2, sam, temp);
+                ap->samples_B [1] = ap->samples_B [0];
+                ap->samples_B [0] = *bptr++;
+
+                *swptr++ = (ap->weight_A + ap->weight_B < 512) ? 1024 : 1536 - ap->weight_A - ap->weight_B;    // same rule using the sum of both weights
+            }
+
+        wps->dc.shaping_samples = sample_count;     // record how many samples now have shaping data
+    }
+
+    if (wpc->wvc_flag) {                            // with wvc_flag set the shaping data is reduced to a line from initial_y to final_y
+        int max_allowed_error = 1000000 / wpc->ave_block_samples;  // NOTE(review): assumes ave_block_samples is nonzero - confirm
+        short max_error, trial_max_error;
+        double initial_y, final_y;
+
+        if (max_allowed_error < 128)                // the allowed error is never less than 128
+            max_allowed_error = 128;
+
+        best_floating_line (wps->dc.shaping_data, sample_count, &initial_y, &final_y, &max_error);
+
+        if (shortening_allowed && max_error > max_allowed_error) {     // the fit is too loose and the caller permits a shorter block
+            int min_samples = 0, max_samples = sample_count, trial_count;
+            double trial_initial_y, trial_final_y;
+
+            while (1) {                             // bisect between min_samples and max_samples for an acceptable length
+                trial_count = (min_samples + max_samples) / 2;
+
+                best_floating_line (wps->dc.shaping_data, trial_count, &trial_initial_y,
+                    &trial_final_y, &trial_max_error);
+
+                if (trial_max_error < max_allowed_error) {     // acceptable: keep this fit and try a longer count next
+                    max_error = trial_max_error;
+                    min_samples = trial_count;
+                    initial_y = trial_initial_y;
+                    final_y = trial_final_y;
+                }
+                else
+                    max_samples = trial_count;
+
+                if (min_samples > 10000 || max_samples - min_samples < 2)      // stop when long enough or the interval has closed
+                    break;
+            }
+
+            sample_count = min_samples;             // NOTE(review): min_samples starts at 0 - confirm it cannot still be 0 here
+        }
+
+        if (initial_y < -512) initial_y = -512;     // both line ends are limited to the range -512..1024
+        else if (initial_y > 1024) initial_y = 1024;
+
+        if (final_y < -512) final_y = -512;
+        else if (final_y > 1024) final_y = 1024;
+#if 0
+        error_line ("%.2f sec, sample count = %5d, max error = %3d, range = %5d, %5d, actual = %5d, %5d",
+            (double) wps->sample_index / wpc->config.sample_rate, sample_count, max_error,
+            (int) floor (initial_y), (int) floor (final_y),
+            wps->dc.shaping_data [0], wps->dc.shaping_data [sample_count-1]);
+#endif
+        if (sample_count != wps->wphdr.block_samples)
+            wps->wphdr.block_samples = sample_count;        // publish the shortened length in the block header
+
+        if (wpc->wvc_flag) {                        // always true here (nested inside the same test), so the else below cannot run
+            wps->dc.shaping_acc [0] = wps->dc.shaping_acc [1] = (int32_t) floor (initial_y * 65536.0 + 0.5);   // start value scaled by 65536, rounded
+
+            wps->dc.shaping_delta [0] = wps->dc.shaping_delta [1] =
+                (int32_t) floor ((final_y - initial_y) / (sample_count - 1) * 65536.0 + 0.5);   // NOTE(review): divides by sample_count - 1 - confirm sample_count > 1
+
+            wps->dc.shaping_array = NULL;           // no per-sample array in this mode; the acc / delta pair is set instead
+        }
+        else
+            wps->dc.shaping_array = wps->dc.shaping_data;
+    }
+    else
+        wps->dc.shaping_array = wps->dc.shaping_data;       // without wvc_flag the per-sample shaping data is used directly
+}
+
+// Given an array of integer data (in shorts), find the linear function that most closely
+// represents it (based on minimum sum of absolute errors). This is returned as the double
+// precision initial & final Y values of the best-fit line. The function can also optionally
+// compute and return a maximum error value (as a short). Note that the ends of the resulting
+// line may fall way outside the range of input values, so some sort of clipping may be
+// needed.
+
+static void best_floating_line (short *values, int num_values, double *initial_y, double *final_y, short *max_error)
+{
+    double left_sum = 0.0, right_sum = 0.0, center_x = (num_values - 1) / 2.0, center_y, m;    // center_x is the middle index
+    int i;
+
+    for (i = 0; i < num_values >> 1; ++i) {         // sum the first half and the last half separately
+        right_sum += values [num_values - i - 1];
+        left_sum += values [i];
+    }
+
+    if (num_values & 1) {                           // odd count: the middle value is split evenly between both sums
+        right_sum += values [num_values >> 1] * 0.5;
+        left_sum += values [num_values >> 1] * 0.5;
+    }
+
+    center_y = (right_sum + left_sum) / num_values;         // mean of all values; NOTE(review): assumes num_values > 0 - confirm
+    m = (right_sum - left_sum) / ((double) num_values * num_values) * 4.0;     // slope of the line per sample
+
+    if (initial_y)                                  // each output pointer is optional
+        *initial_y = center_y - m * center_x;       // line value at index 0
+
+    if (final_y)
+        *final_y = center_y + m * center_x;         // line value at index num_values - 1
+
+    if (max_error) {
+        double max = 0.0;
+
+        for (i = 0; i < num_values; ++i)            // largest absolute distance between a value and the line
+            if (fabs (values [i] - (center_y + (i - center_x) * m)) > max)
+                max = fabs (values [i] - (center_y + (i - center_x) * m));
+
+        *max_error = (short) floor (max + 0.5);     // rounded to nearest; NOTE(review): no clamp, assumes the result fits in a short
+    }
+}
diff --git a/third_party/wavpack/src/pack_dsd.c b/third_party/wavpack/src/pack_dsd.c
new file mode 100644
index 0000000..531f454
--- /dev/null
+++ b/third_party/wavpack/src/pack_dsd.c
@@ -0,0 +1,669 @@
+////////////////////////////////////////////////////////////////////////////
+//                           **** DSDPACK ****                            //
+//         Lossless DSD (Direct Stream Digital) Audio Compressor          //
+//                Copyright (c) 2013 - 2016 David Bryant.                 //
+//                          All Rights Reserved.                          //
+//      Distributed under the BSD Software License (see license.txt)      //
+////////////////////////////////////////////////////////////////////////////
+
+// pack_dsd.c
+
+// This module actually handles the compression of the DSD audio data.
+
+#ifdef ENABLE_DSD
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "wavpack_local.h"
+
+///////////////////////////// executable code ////////////////////////////////
+
+// This function initializes everything required to pack WavPack DSD bitstreams
+// and must be called BEFORE any other function in this module.
+
+void pack_dsd_init (WavpackContext *wpc)
+{
+    // only the currently selected stream is touched: its sample index
+    // is put back to zero
+    wpc->streams [wpc->current_stream]->sample_index = 0;
+}
+
+// DSD-specific overview: stereo input whose two channels match in every byte
+// is first collapsed to one channel and flagged FALSE_STEREO. The audio is
+// then written as an ID_DSD_BLOCK metadata item that starts with the DSD
+// "power" byte (floor of log2 of dsd_multiplier) followed by the encoded data,
+// using encode_buffer_fast() or, when CONFIG_HIGH_FLAG is set, whichever of
+// encode_buffer_high() and encode_buffer_fast() produced the smaller result.
+// If encoding fails (-1), a zero mode byte and the raw sample bytes are
+// stored instead.
+
+// Pack an entire block of samples (either mono or stereo) into a completed
+// WavPack block. It is assumed that there is sufficient space for the
+// completed block at "wps->blockbuff" and that "wps->blockend" points to the
+// end of the available space. A return value of FALSE indicates an error.
+// Any unsent metadata is transmitted first, then required metadata for this
+// block is sent, and finally the compressed integer data is sent. If a "wpx"
+// stream is required for floating point data or large integer data, then this
+// must be handled outside this function. To find out how much data was written
+// the caller must look at the ckSize field of the written WavpackHeader, NOT
+// the one in the WavpackStream.
+
+static int encode_buffer_high (WavpackStream *wps, int32_t *buffer, int num_samples, unsigned char *destination);
+static int encode_buffer_fast (WavpackStream *wps, int32_t *buffer, int num_samples, unsigned char *destination);
+
+int pack_dsd_block (WavpackContext *wpc, int32_t *buffer)
+{
+    WavpackStream *wps = wpc->streams [wpc->current_stream];
+    uint32_t flags = wps->wphdr.flags, mult = wpc->dsd_multiplier, data_count;
+    uint32_t sample_count = wps->wphdr.block_samples;
+    unsigned char *dsd_encoding, dsd_power = 0;
+    int32_t res;                            // encoded byte count, or -1 when the encoders give up
+
+    // This code scans stereo data to check whether it can be stored as mono data
+    // (i.e., the low byte of every L/R sample pair is identical).
+
+    if (!(flags & MONO_FLAG)) {
+        int32_t *sptr, *dptr, i;
+
+        for (sptr = buffer, i = 0; i < (int32_t) sample_count; sptr += 2, i++)
+            if ((sptr [0] ^ sptr [1]) & 0xff)
+                break;
+
+        if (i == sample_count) {            // loop ran to completion: no differing pair was found
+            wps->wphdr.flags = flags |= FALSE_STEREO;
+            dptr = buffer;
+            sptr = buffer;
+
+            for (i = sample_count; i--; sptr++)     // keep one sample of each pair, compacting in place
+                *dptr++ = *sptr++;
+        }
+        else
+            wps->wphdr.flags = flags &= ~FALSE_STEREO;
+    }
+    // start the block with the bare header (ckSize + 8 is the number of bytes in the block so far)
+    wps->wphdr.ckSize = sizeof (WavpackHeader) - 8;
+    memcpy (wps->blockbuff, &wps->wphdr, sizeof (WavpackHeader));
+    // copy any queued metadata into the block, releasing each item after it is copied
+    if (wpc->metacount) {
+        WavpackMetadata *wpmdp = wpc->metadata;
+
+        while (wpc->metacount) {
+            copy_metadata (wpmdp, wps->blockbuff, wps->blockend);
+            wpc->metabytes -= wpmdp->byte_length;
+            free_metadata (wpmdp++);
+            wpc->metacount--;
+        }
+
+        free (wpc->metadata);
+        wpc->metadata = NULL;
+    }
+    // a block without samples carries only the header and the metadata copied above
+    if (!sample_count)
+        return TRUE;
+
+    send_general_metadata (wpc);
+    memcpy (&wps->wphdr, wps->blockbuff, sizeof (WavpackHeader));    // refresh our header copy from the block buffer
+    // the payload starts 4 bytes past the current end of the block, leaving room for the id/size bytes written below
+    dsd_encoding = wps->blockbuff + ((WavpackHeader *) wps->blockbuff)->ckSize + 12;
+    // dsd_power = floor (log2 (dsd_multiplier)); it becomes the first payload byte
+    while (mult >>= 1)
+        dsd_power++;
+
+    *dsd_encoding++ = dsd_power;
+    // high mode tries both encoders on the same output area and keeps the smaller result
+    if (wpc->config.flags & CONFIG_HIGH_FLAG) {
+        int fast_res = encode_buffer_fast (wps, buffer, sample_count, dsd_encoding);
+
+        res = encode_buffer_high (wps, buffer, sample_count, dsd_encoding);
+        // the high encoder wrote over the fast output, so the fast encoding is redone when it wins
+        if ((fast_res != -1) && (res == -1 || res > fast_res))
+            res = encode_buffer_fast (wps, buffer, sample_count, dsd_encoding);
+    }
+    else
+        res = encode_buffer_fast (wps, buffer, sample_count, dsd_encoding);
+    // no encoder produced output: store a zero mode byte plus the raw sample bytes, computing the CRC here
+    if (res == -1) {
+        int num_samples = sample_count * ((flags & MONO_DATA) ? 1 : 2);
+        uint32_t crc = 0xffffffff;
+
+        *dsd_encoding++ = 0;
+
+        data_count = num_samples + 2;       // raw bytes + power byte + mode byte
+
+        while (num_samples--)
+            crc += (crc << 1) + (*dsd_encoding++ = *buffer++);
+
+        ((WavpackHeader *) wps->blockbuff)->crc = crc;
+    }
+    else
+        data_count = res + 1;               // encoded bytes + power byte
+    // write the 4-byte metadata header (id, then the size in 16-bit words over three bytes) ahead of the payload
+    if (data_count) {
+        unsigned char *cptr = wps->blockbuff + ((WavpackHeader *) wps->blockbuff)->ckSize + 8;
+
+        if (data_count & 1) {
+            cptr [data_count + 4] = 0;      // zero pad byte following an odd-length payload
+            *cptr++ = ID_DSD_BLOCK | ID_LARGE | ID_ODD_SIZE;
+            data_count++;
+        }
+        else
+            *cptr++ = ID_DSD_BLOCK | ID_LARGE;
+
+        *cptr++ = data_count >> 1;
+        *cptr++ = data_count >> 9;
+        *cptr++ = data_count >> 17;
+        ((WavpackHeader *) wps->blockbuff)->ckSize += data_count + 4;
+    }
+
+    wps->sample_index += sample_count;
+    return TRUE;
+}
+
+/*------------------------------------------------------------------------------------------------------------------------*/
+
+// #define DSD_BYTE_READY(low,high) (((low) >> 24) == ((high) >> 24))
+// #define DSD_BYTE_READY(low,high) (!(((low) ^ (high)) >> 24))
+#define DSD_BYTE_READY(low,high) (!(((low) ^ (high)) & 0xff000000))
+
+#define MAX_HISTORY_BITS    5
+#define MAX_PROBABILITY     0xa0    // set to 0xff to disable RLE encoding for probabilities table
+
+#if (MAX_PROBABILITY < 0xff)
+
+static int rle_encode (unsigned char *src, int bcount, unsigned char *destination)
+{
+    const int run_limit = 0xff - MAX_PROBABILITY;   // longest zero run a single code byte can express
+    unsigned char *out = destination;
+    int pending = 0, chunk;
+
+    for (; bcount--; src++) {
+        if (!*src) {
+            pending++;                              // zeros are only counted here and emitted later
+            continue;
+        }
+
+        // flush queued zeros (as one or more run codes above MAX_PROBABILITY) before the literal
+        for (; pending; pending -= chunk) {
+            chunk = pending > run_limit ? run_limit : pending;
+            *out++ = MAX_PROBABILITY + chunk;
+        }
+
+        *out++ = *src;
+    }
+
+    // zeros still queued when the input ends are flushed the same way
+    for (; pending; pending -= chunk) {
+        chunk = pending > run_limit ? run_limit : pending;
+        *out++ = MAX_PROBABILITY + chunk;
+    }
+
+    *out++ = 0;                                     // closing zero byte (every code above is non-zero)
+    return (int)(out - destination);
+}
+
+#endif
+
+static void calculate_probabilities (int hist [256], unsigned char probs [256], unsigned short prob_sums [256])
+{
+    int divisor = 0, min_value, max_value, sum_values;
+    int min_hits = 0x7fffffff, max_hits = 0, i;
+
+    // Reduce a 256-entry hit histogram to byte-sized probabilities (each at most MAX_PROBABILITY,
+    // and non-zero for every symbol that was hit) together with their running totals.
+
+    // find the range of hit counts present in this histogram
+    for (i = 0; i < 256; ++i) {
+        min_hits = (hist [i] < min_hits) ? hist [i] : min_hits;
+        max_hits = (hist [i] > max_hits) ? hist [i] : max_hits;
+    }
+
+    // an empty histogram yields all-zero tables
+    if (!max_hits) {
+        memset (probs, 0, sizeof (*probs) * 256);
+        memset (prob_sums, 0, sizeof (*prob_sums) * 256);
+        return;
+    }
+
+//  fprintf (stderr, "process_histogram(): hits = %d to %d\n", min_hits, max_hits);
+
+    // counts that already fit need no scaling (divisor stays 0); otherwise start from the
+    // fixed-point (8 fractional bits) divisor that maps max_hits onto MAX_PROBABILITY, rounded
+    if (max_hits > MAX_PROBABILITY)
+        divisor = ((max_hits << 8) + (MAX_PROBABILITY >> 1)) / MAX_PROBABILITY;
+
+    for (;; divisor++) {
+        // each pass rebuilds both tables from scratch for the current divisor
+        min_value = 0x7fffffff; max_value = 0; sum_values = 0;
+
+        for (i = 0; i < 256; ++i) {
+            int value = 0;
+
+            if (hist [i]) {
+                value = divisor ? ((hist [i] << 8) + (divisor >> 1)) / divisor : hist [i];
+
+                if (divisor && !value)
+                    value = 1;          // a symbol that occurred keeps a non-zero probability
+
+                if (value < min_value) min_value = value;
+                if (value > max_value) max_value = value;
+            }
+
+            // prob_sums [i] is the total of probs [0] through probs [i]
+            prob_sums [i] = sum_values += value;
+            probs [i] = value;
+        }
+
+        // if the largest entry still exceeds the limit, go around again with a larger divisor
+        if (max_value <= MAX_PROBABILITY)
+            break;
+    }
+
+#if 0   // this code reduces probability values when they are completely redundant (i.e., common divisor), but
+        // this doesn't really happen often enough to make it worthwhile
+
+        if (min_value > 1) {
+            for (i = 0; i < 256; ++i)
+                if (probs [i] % min_value)
+                    break;
+
+            if (i == 256) {
+                for (i = 0; i < 256; ++i) {
+                    prob_sums [i] /= min_value;
+                    probs [i] /= min_value;
+                }
+
+                // fprintf (stderr, "fixed min_value = %d, divisor = %d, probs_sum = %d\n", min_value, divisor, prob_sums [255]);
+            }
+        }
+#endif
+}
+
+static int encode_buffer_fast (WavpackStream *wps, int32_t *buffer, int num_samples, unsigned char *destination)
+{
+    // Encode the low byte of every sample in "buffer" to "destination" using per-block probability tables.
+    // Returns the byte count written, or -1 (too few samples, out of memory, or output not small enough).
+    uint32_t flags = wps->wphdr.flags, crc = 0xffffffff;
+    unsigned int low = 0, high = 0xffffffff, mult;
+    unsigned short (*summed_probabilities) [256];
+    unsigned char (*probabilities) [256];
+    unsigned char *dp = destination, *ep;
+    int history_bins, bc, p0 = 0, p1 = 0;
+    int total_summed_probabilities = 0;
+    int (*histogram) [256];
+    int32_t *bp = buffer;
+    char history_bits;
+
+    if (!(flags & MONO_DATA))
+        num_samples *= 2;               // from here on num_samples counts bytes across all channels
+
+    if (num_samples < 280)
+        return -1;
+    else if (num_samples < 560)
+        history_bits = 0;
+    else if (num_samples < 1725)
+        history_bits = 1;
+    else if (num_samples < 5000)
+        history_bits = 2;
+    else if (num_samples < 14000)
+        history_bits = 3;
+    else if (num_samples < 28000)
+        history_bits = 4;
+    else if (num_samples < 76000)
+        history_bits = 5;
+    else if (num_samples < 130000)
+        history_bits = 6;
+    else if (num_samples < 300000)
+        history_bits = 7;
+    else
+        history_bits = 8;
+
+    if (history_bits > MAX_HISTORY_BITS)
+        history_bits = MAX_HISTORY_BITS;
+
+    history_bins = 1 << history_bits;
+    histogram = calloc (history_bins, sizeof (*histogram));         // hit counters must start at zero
+    probabilities = malloc (sizeof (*probabilities) * history_bins);
+    summed_probabilities = malloc (sizeof (*summed_probabilities) * history_bins);
+
+    if (!histogram || !probabilities || !summed_probabilities) {
+        free (histogram); free (probabilities); free (summed_probabilities);
+        return -1;                      // same result as "could not encode", which the caller already handles
+    }
+
+    bc = num_samples;
+
+    if (flags & MONO_DATA)
+        while (bc--) {
+            crc += (crc << 1) + (*bp & 0xff);
+            histogram [p0] [*bp & 0xff]++;
+            p0 = *bp++ & (history_bins-1);      // the bin is chosen by the low bits of the previous byte
+        }
+    else
+        while (bc--) {
+            crc += (crc << 1) + (*bp & 0xff);
+            histogram [p0] [*bp & 0xff]++;
+            p0 = p1;                            // interleaved data: the bin comes from two bytes back
+            p1 = *bp++ & (history_bins-1);
+        }
+
+    for (p0 = 0; p0 < history_bins; p0++) {
+        calculate_probabilities (histogram [p0], probabilities [p0], summed_probabilities [p0]);
+        total_summed_probabilities += summed_probabilities [p0] [255];
+    }
+
+    ((WavpackHeader *) wps->blockbuff)->crc = crc;
+
+    // This code detects the case where the required value lookup tables grow silly big and cuts them back down. This would
+    // normally only happen with large blocks or poorly compressible data. The target is to guarantee that the total memory
+    // required for all three decode tables will be 2K bytes per history bin.
+
+    while (total_summed_probabilities > history_bins * 1280) {
+        int max_sum = 0, sum_values = 0, largest_bin = 0;
+
+        for (p0 = 0; p0 < history_bins; ++p0)
+            if (summed_probabilities [p0] [255] > max_sum) {
+                max_sum = summed_probabilities [p0] [255];
+                largest_bin = p0;
+            }
+
+        total_summed_probabilities -= max_sum;
+        p0 = largest_bin;
+
+        for (p1 = 0; p1 < 256; ++p1)        // halve (rounding up) every probability in the largest bin and re-sum
+            summed_probabilities [p0] [p1] = sum_values += probabilities [p0] [p1] = (probabilities [p0] [p1] + 1) >> 1;
+
+        total_summed_probabilities += summed_probabilities [p0] [255];
+    }
+
+    free (histogram);
+    bp = buffer;
+    bc = num_samples;
+    *dp++ = 1;                              // mode byte written by this encoder
+    *dp++ = history_bits;
+    *dp++ = MAX_PROBABILITY;
+    ep = destination + num_samples - 10;    // output reaching this point makes the function return -1
+
+#if (MAX_PROBABILITY < 0xff)
+    dp += rle_encode ((unsigned char *) probabilities, sizeof (*probabilities) * history_bins, dp);
+#else
+    memcpy (dp, probabilities, sizeof (*probabilities) * history_bins);
+    dp += sizeof (*probabilities) * history_bins;
+#endif
+
+    p0 = p1 = 0;
+
+    while (dp < ep && bc--) {
+        mult = (high - low) / summed_probabilities [p0] [255];
+
+        if (!mult) {                        // range too narrow to divide: flush bytes until it widens
+            high = low;
+
+            while (DSD_BYTE_READY (high, low)) {
+                *dp++ = high >> 24;
+                high = (high << 8) | 0xff;
+                low <<= 8;
+            }
+
+            mult = (high - low) / summed_probabilities [p0] [255];
+        }
+
+        if (*bp & 0xff)
+            low += summed_probabilities [p0] [(*bp & 0xff)-1] * mult;
+
+        high = low + probabilities [p0] [*bp & 0xff] * mult - 1;
+
+        while (DSD_BYTE_READY (high, low)) {
+            *dp++ = high >> 24;
+            high = (high << 8) | 0xff;
+            low <<= 8;
+        }
+
+        if (flags & MONO_DATA)
+            p0 = *bp++ & (history_bins-1);
+        else {
+            p0 = p1;
+            p1 = *bp++ & (history_bins-1);
+        }
+    }
+
+    high = low;
+
+    while (DSD_BYTE_READY (high, low)) {
+        *dp++ = high >> 24;
+        high = (high << 8) | 0xff;
+        low <<= 8;
+    }
+
+    free (summed_probabilities);
+    free (probabilities);
+
+    return (dp < ep) ? (int)(dp - destination) : -1;
+}
+
+/*------------------------------------------------------------------------------------------------------------------------*/
+
+#define PTABLE_BITS 8
+#define PTABLE_BINS (1<<PTABLE_BITS)
+#define PTABLE_MASK (PTABLE_BINS-1)
+
+#define INITIAL_TERM (1536/PTABLE_BINS)
+
+#define UP   0x010000fe
+#define DOWN 0x00010000
+#define DECAY 8
+
+#define PRECISION 20
+#define VALUE_ONE (1 << PRECISION)
+#define PRECISION_USE 12
+
+#define RATE_S 20
+
+static void init_ptable (int *table, int rate_i, int rate_s)
+{
+    int level = 0x808000, step_rate = rate_i << 8, steps, lo, hi;
+
+    // apply the initial run of decay steps toward DOWN before filling any entry
+    steps = (step_rate + 128) >> 8;
+    while (steps--)
+        level += (DOWN - level) >> DECAY;
+
+    // fill from both ends toward the middle; each upper entry is 0x100ffff minus its lower partner
+    for (lo = 0, hi = PTABLE_BINS - 1; lo < PTABLE_BINS/2; ++lo, --hi) {
+        table [lo] = level;
+        table [hi] = 0x100ffff - level;
+        if (level <= 0x010000) continue;            // no further decay once the level is this low
+        step_rate += (step_rate * rate_s + 128) >> 8;
+        steps = (step_rate + 64) >> 7;
+        while (steps--) level += (DOWN - level) >> DECAY;
+    }
+}
+
+static int normalize_ptable (int *ptable)
+{
+    int candidate [PTABLE_BINS];
+    int best_error = 0, trial_rate, bin;
+
+    // Walk the rates upward from zero, regenerating a reference table for each one with
+    // init_ptable() and summing its per-entry absolute distance from ptable (each term
+    // shifted right by 8). Stop at the first rate that fails to strictly improve on its
+    // predecessor and report the predecessor.
+    for (trial_rate = 0; ; ++trial_rate) {
+        int distance = 0;
+
+        init_ptable (candidate, trial_rate, RATE_S);
+
+        for (bin = 0; bin < PTABLE_BINS; ++bin)
+            distance += abs (ptable [bin] - candidate [bin]) >> 8;
+
+        // rate zero only establishes the baseline and is never a stopping point itself
+        if (trial_rate && distance >= best_error)
+            return trial_rate - 1;
+
+        best_error = distance;
+    }
+}
+
+static int encode_buffer_high (WavpackStream *wps, int32_t *buffer, int num_samples, unsigned char *destination)
+{
+    // Encode the low byte of every sample bit by bit, taking each bit's probability from the adaptive
+    // table wps->dsd.ptable indexed by a per-channel filter prediction. Returns the byte count written,
+    // or -1 (fewer than 280 bytes of input, out of memory, or the output was not small enough).
+    int channel, stereo = (wps->wphdr.flags & MONO_DATA) ? 0 : 1;
+    uint32_t crc = 0xffffffff, high = 0xffffffff, low = 0;
+    unsigned char *dp = destination, *ep;
+    DSDfilters *sp;
+
+    if (num_samples * (stereo + 1) < 280)
+        return -1;
+
+    *dp++ = 3;                              // mode byte written by this encoder
+    ep = destination + num_samples * (stereo + 1) - 10;
+
+    // A missing table is handled like the first block: an earlier block that returned above never allocated it
+    if (!wps->sample_index || !wps->dsd.ptable) {
+        if (!wps->dsd.ptable && !(wps->dsd.ptable = malloc (PTABLE_BINS * sizeof (*wps->dsd.ptable))))
+            return -1;                      // out of memory: report "could not encode" so the caller falls back
+
+        init_ptable (wps->dsd.ptable, INITIAL_TERM, RATE_S);
+
+        for (channel = 0; channel < 2; ++channel) {
+            sp = wps->dsd.filters + channel;
+
+            sp->filter1 = sp->filter2 = sp->filter3 = sp->filter4 = sp->filter5 = VALUE_ONE / 2;
+            sp->filter6 = sp->factor = 0;
+        }
+
+        *dp++ = INITIAL_TERM;
+        *dp++ = RATE_S;
+    }
+    else {
+        int rate = normalize_ptable (wps->dsd.ptable);  // replace the adapted table by the closest generated one
+        init_ptable (wps->dsd.ptable, rate, RATE_S);
+        *dp++ = rate;
+        *dp++ = RATE_S;
+    }
+
+    // store each filter reduced to 8 bits and the factor as 16 bits, and continue from exactly those stored values
+    for (channel = 0; channel <= stereo; ++channel) {
+        sp = wps->dsd.filters + channel;
+
+        *dp = sp->filter1 >> (PRECISION - 8);
+        sp->filter1 = *dp++ << (PRECISION - 8);
+
+        *dp = sp->filter2 >> (PRECISION - 8);
+        sp->filter2 = *dp++ << (PRECISION - 8);
+
+        *dp = sp->filter3 >> (PRECISION - 8);
+        sp->filter3 = *dp++ << (PRECISION - 8);
+
+        *dp = sp->filter4 >> (PRECISION - 8);
+        sp->filter4 = *dp++ << (PRECISION - 8);
+
+        *dp = sp->filter5 >> (PRECISION - 8);
+        sp->filter5 = *dp++ << (PRECISION - 8);
+
+        *dp++ = sp->factor;
+        *dp++ = sp->factor >> 8;
+        sp->filter6 = 0;
+        sp->factor = (sp->factor << 16) >> 16;      // keep only the sign-extended low 16 bits that were stored
+    }
+
+    sp = wps->dsd.filters;
+
+    while (dp < ep && num_samples--) {
+        int bitcount = 8;
+
+        crc += (crc << 1) + (sp->byte = *buffer++ & 0xff);
+        sp [0].value = sp [0].filter1 - sp [0].filter5 + ((sp [0].filter6 * sp [0].factor) >> 2);
+
+        if (stereo) {
+            crc += (crc << 1) + (sp [1].byte = *buffer++ & 0xff);
+            sp [1].value = sp [1].filter1 - sp [1].filter5 + ((sp [1].filter6 * sp [1].factor) >> 2);
+        }
+
+        while (bitcount--) {                // bits go out MSB first, the two channels alternating bit by bit
+            int32_t *pp = wps->dsd.ptable + ((sp [0].value >> (PRECISION - PRECISION_USE)) & PTABLE_MASK);
+
+            if (sp [0].byte & 0x80) {
+                high = low + ((high - low) >> 8) * (*pp >> 16);
+                *pp += (UP - *pp) >> DECAY;
+                sp [0].filter0 = -1;
+            }
+            else {
+                low += 1 + ((high - low) >> 8) * (*pp >> 16);
+                *pp += (DOWN - *pp) >> DECAY;
+                sp [0].filter0 = 0;
+            }
+
+            while (DSD_BYTE_READY (high, low)) {
+                *dp++ = high >> 24;
+                high = (high << 8) | 0xff;
+                low <<= 8;
+            }
+
+            sp [0].value += sp [0].filter6 << 3;
+            sp [0].factor += (((sp [0].value ^ sp [0].filter0) >> 31) | 1) & ((sp [0].value ^ (sp [0].value - (sp [0].filter6 << 4))) >> 31);
+            sp [0].filter1 += ((sp [0].filter0 & VALUE_ONE) - sp [0].filter1) >> 6;
+            sp [0].filter2 += ((sp [0].filter0 & VALUE_ONE) - sp [0].filter2) >> 4;
+            sp [0].filter3 += (sp [0].filter2 - sp [0].filter3) >> 4;
+            sp [0].filter4 += (sp [0].filter3 - sp [0].filter4) >> 4;
+            sp [0].value = (sp [0].filter4 - sp [0].filter5) >> 4;
+            sp [0].filter5 += sp [0].value;
+            sp [0].filter6 += (sp [0].value - sp [0].filter6) >> 3;
+            sp [0].value = sp [0].filter1 - sp [0].filter5 + ((sp [0].filter6 * sp [0].factor) >> 2);
+            sp [0].byte <<= 1;
+
+            if (!stereo)
+                continue;
+
+            pp = wps->dsd.ptable + ((sp [1].value >> (PRECISION - PRECISION_USE)) & PTABLE_MASK);
+
+            if (sp [1].byte & 0x80) {
+                high = low + ((high - low) >> 8) * (*pp >> 16);
+                *pp += (UP - *pp) >> DECAY;
+                sp [1].filter0 = -1;
+            }
+            else {
+                low += 1 + ((high - low) >> 8) * (*pp >> 16);
+                *pp += (DOWN - *pp) >> DECAY;
+                sp [1].filter0 = 0;
+            }
+
+            while (DSD_BYTE_READY (high, low)) {
+                *dp++ = high >> 24;
+                high = (high << 8) | 0xff;
+                low <<= 8;
+            }
+
+            sp [1].value += sp [1].filter6 << 3;
+            sp [1].factor += (((sp [1].value ^ sp [1].filter0) >> 31) | 1) & ((sp [1].value ^ (sp [1].value - (sp [1].filter6 << 4))) >> 31);
+            sp [1].filter1 += ((sp [1].filter0 & VALUE_ONE) - sp [1].filter1) >> 6;
+            sp [1].filter2 += ((sp [1].filter0 & VALUE_ONE) - sp [1].filter2) >> 4;
+            sp [1].filter3 += (sp [1].filter2 - sp [1].filter3) >> 4;
+            sp [1].filter4 += (sp [1].filter3 - sp [1].filter4) >> 4;
+            sp [1].value = (sp [1].filter4 - sp [1].filter5) >> 4;
+            sp [1].filter5 += sp [1].value;
+            sp [1].filter6 += (sp [1].value - sp [1].filter6) >> 3;
+            sp [1].value = sp [1].filter1 - sp [1].filter5 + ((sp [1].filter6 * sp [1].factor) >> 2);
+            sp [1].byte <<= 1;
+        }
+
+        sp [0].factor -= (sp->factor + 512) >> 10;
+
+        if (stereo)
+            sp [1].factor -= (sp [1].factor + 512) >> 10;
+    }
+
+    ((WavpackHeader *) wps->blockbuff)->crc = crc;
+    high = low;
+
+    while (DSD_BYTE_READY (high, low)) {
+        *dp++ = high >> 24;
+        high = (high << 8) | 0xff;
+        low <<= 8;
+    }
+
+    return (dp < ep) ? (int)(dp - destination) : -1;
+}
+
+#endif      // ENABLE_DSD
diff --git a/third_party/wavpack/src/pack_floats.c b/third_party/wavpack/src/pack_floats.c
new file mode 100644
index 0000000..90ab656
--- /dev/null
+++ b/third_party/wavpack/src/pack_floats.c
@@ -0,0 +1,270 @@
+////////////////////////////////////////////////////////////////////////////
+//                           **** WAVPACK ****                            //
+//                  Hybrid Lossless Wavefile Compressor                   //
+//              Copyright (c) 1998 - 2013 Conifer Software.               //
+//                          All Rights Reserved.                          //
+//      Distributed under the BSD Software License (see license.txt)      //
+////////////////////////////////////////////////////////////////////////////
+
+// pack_floats.c
+
+// This module deals with the compression of floating-point data. Note that no
+// floating point math is involved here...the values are only processed with
+// the macros that directly access the mantissa, exponent, and sign fields.
+// That's why we use the f32 type instead of the built-in float type.
+
+#include <stdlib.h>
+
+#include "wavpack_local.h"
+
+//#define DISPLAY_DIAGNOSTICS
+
+// Scan the provided buffer of floating-point values and (1) convert the
+// significant portion of the data to integers for compression using the
+// regular WavPack algorithms (which only operate on integers) and (2)
+// determine whether the data requires a second stream for lossless
+// storage (which will usually be the case except when the floating-point
+// data was originally integer data). The converted integers are returned
+// "in-place" and a return value of TRUE indicates that a second stream
+// is required.
+
+int scan_float_data (WavpackStream *wps, f32 *values, int32_t num_values)
+{
+    int32_t shifted_ones = 0, shifted_zeros = 0, shifted_both = 0;  // tallies of what the shifted-out bits looked like
+    int32_t false_zeros = 0, neg_zeros = 0;
+#ifdef DISPLAY_DIAGNOSTICS
+    int32_t true_zeros = 0, denormals = 0, exceptions = 0;
+#endif
+    uint32_t ordata = 0, crc = 0xffffffff;  // ordata accumulates the OR of every converted integer magnitude
+    int32_t count, value, shift_count;
+    int max_mag = 0, max_exp = 0;
+    f32 *dp;
+
+    wps->float_shift = wps->float_flags = 0;
+
+    // First loop goes through all the data and (1) calculates the CRC and (2) finds the
+    // max magnitude that does not have an exponent of 255 (reserved for +/-inf and NaN).
+    for (dp = values, count = num_values; count--; dp++) {
+        crc = crc * 27 + get_mantissa (*dp) * 9 + get_exponent (*dp) * 3 + get_sign (*dp);
+
+        if (get_exponent (*dp) < 255 && get_magnitude (*dp) > max_mag)
+            max_mag = get_magnitude (*dp);
+    }
+
+    wps->crc_x = crc;
+
+    // round up the magnitude so that when we convert the floating-point values to integers,
+    // they will be (at most) just over 24-bits signed precision
+    if (get_exponent (max_mag))
+        max_exp = get_exponent (max_mag + 0x7F0000);
+
+    for (dp = values, count = num_values; count--; dp++) {
+        // Exponent of 255 is reserved for +/-inf (mantissa = 0) or NaN (mantissa != 0).
+        // we use a value one greater than 24-bits unsigned for this.
+        if (get_exponent (*dp) == 255) {
+#ifdef DISPLAY_DIAGNOSTICS
+            exceptions++;
+#endif
+            wps->float_flags |= FLOAT_EXCEPTIONS;
+            value = 0x1000000;
+            shift_count = 0;
+        }
+        // This is the regular case. We generate a 24-bit unsigned value with the implied
+        // '1' MSB set and calculate a shift that will make it line up with the biggest
+        // samples in this block (although that shift would obviously shift out real data).
+        else if (get_exponent (*dp)) {
+            shift_count = max_exp - get_exponent (*dp);
+            value = 0x800000 + get_mantissa (*dp);
+        }
+        // Zero exponent means either +/- zero (mantissa = 0) or denormals (mantissa != 0).
+        // shift_count is set so that denormals (without an implied '1') will line up with
+        // regular values (with their implied '1' added at bit 23). Trust me. We don't care
+        // about the shift with zero.
+        else {
+            shift_count = max_exp ? max_exp - 1 : 0;
+            value = get_mantissa (*dp);
+
+#ifdef DISPLAY_DIAGNOSTICS
+            if (get_mantissa (*dp))
+                denormals++;
+#endif
+        }
+
+        if (shift_count < 25)
+            value >>= shift_count;      // perform the shift if there could be anything left
+        else
+            value = 0;                  // else just zero the value
+
+        // If we are going to encode an integer zero, then this might be a "false zero" which
+        // means that there are significant bits but they're completely shifted out, or a
+        // "negative zero" which is simply a floating point value that we have to encode
+        // (and converting it to a positive zero would be an error).
+        if (!value) {
+            if (get_exponent (*dp) || get_mantissa (*dp))
+                ++false_zeros;
+            else if (get_sign (*dp))
+                ++neg_zeros;
+#ifdef DISPLAY_DIAGNOSTICS
+            else
+                ++true_zeros;
+#endif
+        }
+        // If we are going to shift something (but not everything) out of our integer before
+        // encoding, then we generate a mask corresponding to the bits that will be shifted
+        // out and increment the counter for the 3 possible cases of (1) all zeros, (2) all
+        // ones, and (3) a mix of ones and zeros.
+        else if (shift_count) {
+            int32_t mask = (1 << shift_count) - 1;
+
+            if (!(get_mantissa (*dp) & mask))
+                shifted_zeros++;
+            else if ((get_mantissa (*dp) & mask) == mask)
+                shifted_ones++;
+            else
+                shifted_both++;
+        }
+
+        // "or" all the integer values together, and store the final integer with applied sign
+
+        ordata |= value;
+        * (int32_t *) dp = (get_sign (*dp)) ? -value : value;   // overwrites the float in place with its integer
+    }
+
+    wps->float_max_exp = max_exp;   // on decode, we use this to calculate actual exponent
+
+    // Now, based on our various counts, we determine the scheme required to encode the bits
+    // shifted out. Usually these will simply have to be sent literally, but in some rare cases
+    // we can get away with always assuming ones shifted out, or assuming all the bits shifted
+    // out in each value are the same (which means we only have to send a single bit).
+    if (shifted_both)
+        wps->float_flags |= FLOAT_SHIFT_SENT;
+    else if (shifted_ones && !shifted_zeros)
+        wps->float_flags |= FLOAT_SHIFT_ONES;
+    else if (shifted_ones && shifted_zeros)
+        wps->float_flags |= FLOAT_SHIFT_SAME;
+    // Another case is that we only shift out zeros (or maybe nothing), and in that case we
+    // check to see if our data actually has less than 24 or 25 bits of resolution, which means
+    // that we can reduce the magnitude of the integers we are encoding (which saves all those
+    // bits). The number of bits of reduced resolution is stored in float_shift.
+    else if (ordata && !(ordata & 1)) {
+        while (!(ordata & 1)) {
+            wps->float_shift++;
+            ordata >>= 1;
+        }
+
+        // here we shift out all those zeros in the integer data we will encode
+        for (dp = values, count = num_values; count--; dp++)
+            * (int32_t *) dp >>= wps->float_shift;
+    }
+
+    // Here we calculate the actual magnitude used by our integer data, although this is just
+    // used for informational purposes during encode/decode to possibly use faster math.
+
+    wps->wphdr.flags &= ~MAG_MASK;
+
+    while (ordata) {                    // adds one to the MAG field for every significant bit left in ordata
+        wps->wphdr.flags += 1 << MAG_LSB;
+        ordata >>= 1;
+    }
+
+    // Finally, we have to set some flags that guide how we encode various types of "zeros".
+    // If none of these are set (which is the most common situation), then every integer
+    // zero in the decoded data will simply become a floating-point zero.
+
+    if (false_zeros || neg_zeros)
+        wps->float_flags |= FLOAT_ZEROS_SENT;
+
+    if (neg_zeros)
+        wps->float_flags |= FLOAT_NEG_ZEROS;
+
+#ifdef DISPLAY_DIAGNOSTICS
+    {
+        int32_t *ip, min = 0x7fffffff, max = 0x80000000;
+        for (ip = (int32_t *) values, count = num_values; count--; ip++) {
+            if (*ip < min) min = *ip;
+            if (*ip > max) max = *ip;
+        }
+
+        fprintf (stderr, "integer range = %d to %d\n", min, max);
+    }
+
+    fprintf (stderr, "samples = %d, max exp = %d, pre-shift = %d, denormals = %d, exceptions = %d, max_mag = %x\n",
+        num_values, max_exp, wps->float_shift, denormals, exceptions, max_mag);
+    fprintf (stderr, "shifted ones/zeros/both = %d/%d/%d, true/neg/false zeros = %d/%d/%d\n",
+        shifted_ones, shifted_zeros, shifted_both, true_zeros, neg_zeros, false_zeros);
+#endif
+
+    // non-zero when any flag is set that requires extra data beyond the integers (see function header)
+    return wps->float_flags & (FLOAT_EXCEPTIONS | FLOAT_ZEROS_SENT | FLOAT_SHIFT_SENT | FLOAT_SHIFT_SAME);
+}
+
+// Given a buffer of float data, convert the data to integers (which is what the WavPack compression
+// algorithms require) and write the other data required for lossless compression (which includes
+// significant bits shifted out of the integers, plus information about +/- zeros and exceptions
+// like NaN and +/- infinities) into the wvxbits stream (which is assumed to be opened). Note that
+// for this to work correctly, scan_float_data() must have been called on the original data to set
+// the appropriate flags in float_flags and the value of float_max_exp.
+
+void send_float_data (WavpackStream *wps, f32 *values, int32_t num_values)
+{
+    int max_exp = wps->float_max_exp;
+    int32_t count, value, shift_count;
+    f32 *dp;
+    // value and shift_count are recomputed per sample exactly as in scan_float_data(); nothing is stored to values
+    for (dp = values, count = num_values; count--; dp++) {
+        if (get_exponent (*dp) == 255) {        // exponent 255: a set bit plus the 23-bit mantissa if it is non-zero, else a clear bit
+            if (get_mantissa (*dp)) {
+                putbit_1 (&wps->wvxbits);
+                putbits (get_mantissa (*dp), 23, &wps->wvxbits);
+            }
+            else {
+                putbit_0 (&wps->wvxbits);
+            }
+
+            value = 0x1000000;
+            shift_count = 0;
+        }
+        else if (get_exponent (*dp)) {
+            shift_count = max_exp - get_exponent (*dp);
+            value = 0x800000 + get_mantissa (*dp);
+        }
+        else {
+            shift_count = max_exp ? max_exp - 1 : 0;
+            value = get_mantissa (*dp);
+        }
+
+        if (shift_count < 25)
+            value >>= shift_count;
+        else
+            value = 0;
+
+        if (!value) {                           // integer zero: extra bits are written only if FLOAT_ZEROS_SENT is set
+            if (wps->float_flags & FLOAT_ZEROS_SENT) {
+                if (get_exponent (*dp) || get_mantissa (*dp)) {
+                    putbit_1 (&wps->wvxbits);   // non-zero float: set bit, mantissa, optional exponent, then sign
+                    putbits (get_mantissa (*dp), 23, &wps->wvxbits);
+
+                    if (max_exp >= 25) {
+                        putbits (get_exponent (*dp), 8, &wps->wvxbits);
+                    }
+
+                    putbit (get_sign (*dp), &wps->wvxbits);
+                }
+                else {
+                    putbit_0 (&wps->wvxbits);   // zero float: clear bit, plus the sign when FLOAT_NEG_ZEROS is set
+
+                    if (wps->float_flags & FLOAT_NEG_ZEROS)
+                        putbit (get_sign (*dp), &wps->wvxbits);
+                }
+            }
+        }
+        else if (shift_count) {                 // some mantissa bits were shifted out of a non-zero integer
+            if (wps->float_flags & FLOAT_SHIFT_SENT) {
+                int32_t data = get_mantissa (*dp) & ((1 << shift_count) - 1);
+                putbits (data, shift_count, &wps->wvxbits);     // the shifted-out bits, sent literally
+            }
+            else if (wps->float_flags & FLOAT_SHIFT_SAME) {
+                putbit (get_mantissa (*dp) & 1, &wps->wvxbits); // only the lowest mantissa bit is sent
+            }
+        }
+    }
+}
diff --git a/third_party/wavpack/src/pack_utils.c b/third_party/wavpack/src/pack_utils.c
new file mode 100644
index 0000000..1918c18
--- /dev/null
+++ b/third_party/wavpack/src/pack_utils.c
@@ -0,0 +1,1418 @@
+////////////////////////////////////////////////////////////////////////////
+//                           **** WAVPACK ****                            //
+//                  Hybrid Lossless Wavefile Compressor                   //
+//              Copyright (c) 1998 - 2013 Conifer Software.               //
+//                          All Rights Reserved.                          //
+//      Distributed under the BSD Software License (see license.txt)      //
+////////////////////////////////////////////////////////////////////////////
+
+// pack_utils.c
+
+// This module provides the high-level API for creating WavPack files from
+// audio data. It manages the buffers used to deinterleave the data passed
+// in from the application into the individual streams and it handles the
+// generation of riff headers and the "fixup" on the first WavPack block
+// header for the case where the number of samples was unknown (or wrong).
+// The actual audio stream compression is handled in the pack.c module.
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "wavpack_local.h"
+
+///////////////////////////// executable code ////////////////////////////////
+
+// Open context for writing WavPack files. The returned context pointer is used
+// in all following calls to the library. The "blockout" function will be used
+// to store the actual completed WavPack blocks and will be called with the id
+// pointers containing user defined data (one for the wv file and one for the
+// wvc file). A return value of NULL indicates that memory could not be
+// allocated for the context.
+
+WavpackContext *WavpackOpenFileOutput (WavpackBlockOutput blockout, void *wv_id, void *wvc_id)
+{
+    WavpackContext *wpc = malloc (sizeof (WavpackContext));    // heap-allocated context handed back to the caller
+
+    if (!wpc)
+        return NULL;                            // allocation failed; nothing else to undo
+
+    CLEAR (*wpc);                               // reset the whole context (CLEAR is defined outside this file)
+    wpc->total_samples = -1;                    // -1 = length unknown until a configuration call supplies it
+    wpc->stream_version = CUR_STREAM_VERS;      // default; reassigned by WavpackSetConfiguration64()
+    wpc->blockout = blockout;                   // callback that receives the completed WavPack blocks
+    wpc->wv_out = wv_id;                        // user data for the wv file
+    wpc->wvc_out = wvc_id;                      // user data for the wvc (correction) file
+    return wpc;
+}
+
+static int add_to_metadata (WavpackContext *wpc, void *data, uint32_t bcount, unsigned char id);   // forward declaration
+
+// New for version 5.0, this function allows the application to store a file extension and a
+// file_format identification. The extension would be used by the unpacker if the user had not
+// specified the target filename, and specifically handles the case where the original file
+// had the "wrong" extension for the file format (e.g., a Wave64 file having a "wav" extension)
+// or an alternative (e.g., "bwf") or where the file format is not known. Specifying a file
+// format besides the default WP_FORMAT_WAV will ensure that old decoders will not be able to
+// see the non-wav wrapper provided with WavpackAddWrapper() (which they would end up putting
+// on a file with a .wav extension).
+
+void WavpackSetFileInformation (WavpackContext *wpc, char *file_extension, unsigned char file_format)
+{
+    if (file_extension && strlen (file_extension) < sizeof (wpc->file_extension)) {     // NULL or over-long extensions are ignored
+        add_to_metadata (wpc, file_extension, (uint32_t) strlen (file_extension), ID_ALT_EXTENSION);   // no NUL stored; result unchecked
+        strcpy (wpc->file_extension, file_extension);   // length was checked above, so the terminator fits
+    }
+
+    wpc->file_format = file_format;     // recorded even when the extension was skipped
+}
+
+// Set configuration for writing WavPack files. This must be done before
+// sending any actual samples, however it is okay to send wrapper or other
+// metadata before calling this. The "config" structure contains the following
+// required information:
+
+// config->bytes_per_sample     see WavpackGetBytesPerSample() for info
+// config->bits_per_sample      see WavpackGetBitsPerSample() for info
+// config->channel_mask         Microsoft standard (mono = 4, stereo = 3)
+// config->num_channels         self evident
+// config->sample_rate          self evident
+
+// In addition, the following fields and flags may be set:
+
+// config->flags:
+// --------------
+// o CONFIG_HYBRID_FLAG         select hybrid mode (must set bitrate)
+// o CONFIG_JOINT_STEREO        select joint stereo (must set override also)
+// o CONFIG_JOINT_OVERRIDE      override default joint stereo selection
+// o CONFIG_HYBRID_SHAPE        select hybrid noise shaping (set override &
+//                                                      shaping_weight != 0.0)
+// o CONFIG_SHAPE_OVERRIDE      override default hybrid noise shaping
+//                               (set CONFIG_HYBRID_SHAPE and shaping_weight)
+// o CONFIG_FAST_FLAG           "fast" compression mode
+// o CONFIG_HIGH_FLAG           "high" compression mode
+// o CONFIG_BITRATE_KBPS        hybrid bitrate is kbps, not bits / sample
+// o CONFIG_CREATE_WVC          create correction file
+// o CONFIG_OPTIMIZE_WVC        maximize hybrid compression (-cc option)
+// o CONFIG_CALC_NOISE          calc noise in hybrid mode
+// o CONFIG_EXTRA_MODE          extra processing mode (slow!)
+// o CONFIG_SKIP_WVX            no wvx stream for floats & large ints
+// o CONFIG_MD5_CHECKSUM        specify if you plan to store MD5 signature
+// o CONFIG_CREATE_EXE          specify if you plan to prepend sfx module
+// o CONFIG_OPTIMIZE_MONO       detect and optimize for mono files posing as
+//                               stereo (uses a more recent stream format that
+//                               is not compatible with decoders < 4.3)
+
+// config->bitrate              hybrid bitrate in either bits/sample or kbps
+// config->shaping_weight       hybrid noise shaping coefficient override
+// config->block_samples        force samples per WavPack block (0 = use deflt)
+// config->float_norm_exp       select floating-point data (127 for +/-1.0)
+// config->xmode                extra mode processing value override
+
+// If the number of samples to be written is known then it should be passed
+// here. If the duration is not known then pass -1. In the case that the size
+// is not known (or the writing is terminated early) then it is suggested that
+// the application retrieve the first block written and let the library update
+// the total samples indication. A function is provided to do this update and
+// it should be done to the "correction" file also. If this cannot be done
+// (because a pipe is being used, for instance) then a valid WavPack will still
+// be created, but when applications want to access that file they will have
+// to seek all the way to the end to determine the actual duration. Also, if
+// a RIFF header has been included then it should be updated as well or the
+// WavPack file will not be directly unpackable to a valid wav file (although
+// it will still be usable by itself). A return of FALSE indicates an error.
+//
+// The enhanced version of this function now allows setting the identities of
+// any channels that are NOT standard Microsoft channels and are therefore not
+// represented in the channel mask. WavPack files require that all the Microsoft
+// channels come first (and in Microsoft order) and these are followed by any
+// other channels (which can be in any order).
+//
+// The identities are provided in a NULL-terminated string (0x00 is not an allowed
+// channel ID). The Microsoft channels may be provided as well (and will be checked)
+// but it is really only necessary to provide the "unknown" channels. Any truly
+// unknown channels are indicated with a 0xFF.
+//
+// The channel IDs so far reserved are listed here:
+//
+// 0:           not allowed / terminator
+// 1 - 18:      Microsoft standard channels
+// 30, 31:      Stereo mix from RF64 (not really recommended, but RF64 specifies this)
+// 33 - 44:     Core Audio channels (see Core Audio specification)
+// 127 - 128:   Amio LeftHeight, Amio RightHeight
+// 138 - 142:   Amio BottomFrontLeft/Center/Right, Amio ProximityLeft/Right
+// 200 - 207:   Core Audio channels (see Core Audio specification)
+// 221 - 224:   Core Audio channels 301 - 305 (offset by 80)
+// 255:         Present but unknown or unused channel
+//
+// All other channel IDs are reserved. Ask if something you need is missing.
+
+// Table of channels that will automatically "pair" into a single stereo stream
+
+static const struct { unsigned char a, b; } stereo_pairs [] = {     // channel-ID pairs; matched in either order (a,b) or (b,a)
+    { 1, 2 },       // FL, FR
+    { 5, 6 },       // BL, BR
+    { 7, 8 },       // FLC, FRC
+    { 10, 11 },     // SL, SR
+    { 13, 15 },     // TFL, TFR
+    { 16, 18 },     // TBL, TBR
+    { 30, 31 },     // stereo mix L,R (RF64)
+    { 33, 34 },     // Rls, Rrs
+    { 35, 36 },     // Lw, Rw
+    { 38, 39 },     // Lt, Rt
+    { 127, 128 },   // Lh, Rh
+    { 138, 140 },   // Bfl, Bfr
+    { 141, 142 },   // Pl, Pr
+    { 200, 201 },   // Amb_W, Amb_X
+    { 202, 203 },   // Amb_Y, Amb_Z
+    { 204, 205 },   // MS_Mid, MS_Side
+    { 206, 207 },   // XY_X, XY_Y
+    { 221, 222 },   // Hph_L, Hph_R
+};
+
+#define NUM_STEREO_PAIRS (sizeof (stereo_pairs) / sizeof (stereo_pairs [0]))    // number of entries in the table above
+
+// Legacy version of this function for compatibility with existing applications. Note that this version
+// also generates older streams to be compatible with all decoders back to 4.0, but of course cannot be
+// used with > 2^32 samples or non-Microsoft channels. The older stream version only differs in that it
+// does not support the "mono optimization" feature where stereo blocks containing identical audio data
+// in both channels are encoded in mono for better efficiency.
+
+int WavpackSetConfiguration (WavpackContext *wpc, WavpackConfig *config, uint32_t total_samples)
+{
+    config->flags |= CONFIG_COMPATIBLE_WRITE;       // write earlier version streams (note: modifies the caller's config)
+
+    if (total_samples == (uint32_t) -1)             // 32-bit "unknown length" sentinel
+        return WavpackSetConfiguration64 (wpc, config, -1, NULL);               // forward it as the 64-bit -1
+    else
+        return WavpackSetConfiguration64 (wpc, config, total_samples, NULL);    // known count, widened to int64_t; no chan_ids
+}
+
+int WavpackSetConfiguration64 (WavpackContext *wpc, WavpackConfig *config, int64_t total_samples, const unsigned char *chan_ids)
+{   // Copies *config into wpc and creates one WavpackStream per mono/stereo channel group; FALSE (with error_message set) on error.
+    uint32_t flags, bps = 0;                        // header flags / hybrid bitrate (bits per sample * 256) given to every stream
+    uint32_t chan_mask = config->channel_mask;      // working copy; bits are cleared as channels are assigned to streams
+    int num_chans = config->num_channels;           // channels still waiting to be placed in a stream
+    int i;                                          // NOTE(review): sample_rate == 0 and num_channels <= 0 are not rejected here
+
+    wpc->stream_version = (config->flags & CONFIG_COMPATIBLE_WRITE) ? CUR_STREAM_VERS : MAX_STREAM_VERS;
+
+    if ((config->qmode & QMODE_DSD_AUDIO) && config->bytes_per_sample == 1 && config->bits_per_sample == 8) {
+#ifdef ENABLE_DSD
+        wpc->dsd_multiplier = 1;
+        flags = DSD_FLAG;
+
+        for (i = 14; i >= 0; --i)                   // walk the sample_rates table from its last entry down
+            if (config->sample_rate % sample_rates [i] == 0) {
+                int divisor = config->sample_rate / sample_rates [i];
+
+                if (divisor && (divisor & (divisor - 1)) == 0) {    // accept only power-of-two multiples of a table rate
+                    config->sample_rate /= divisor;                 // note: rewrites the caller's config
+                    wpc->dsd_multiplier = divisor;
+                    break;
+                }
+            }
+
+        // most options that don't apply to DSD we can simply ignore for now, but NOT hybrid mode!
+        if (config->flags & CONFIG_HYBRID_FLAG) {
+            strcpy (wpc->error_message, "hybrid mode not available for DSD!");
+            return FALSE;
+        }
+
+        // with DSD, very few PCM options work (or make sense), so only allow those that do
+        config->flags &= (CONFIG_HIGH_FLAG | CONFIG_MD5_CHECKSUM | CONFIG_PAIR_UNDEF_CHANS);
+        config->float_norm_exp = config->xmode = 0;
+#else
+        strcpy (wpc->error_message, "libwavpack not configured for DSD!");
+        return FALSE;
+#endif
+    }
+    else
+        flags = config->bytes_per_sample - 1;       // PCM: low flag bits hold bytes per sample minus one
+
+    wpc->total_samples = total_samples;
+    wpc->config.sample_rate = config->sample_rate;
+    wpc->config.num_channels = config->num_channels;
+    wpc->config.channel_mask = config->channel_mask;
+    wpc->config.bits_per_sample = config->bits_per_sample;
+    wpc->config.bytes_per_sample = config->bytes_per_sample;
+    wpc->config.block_samples = config->block_samples;
+    wpc->config.flags = config->flags;
+    wpc->config.qmode = config->qmode;
+
+    if (config->flags & CONFIG_VERY_HIGH_FLAG)      // "very high" also turns on "high"
+        wpc->config.flags |= CONFIG_HIGH_FLAG;
+
+    for (i = 0; i < 15; ++i)                        // look the rate up in the sample_rates table
+        if (wpc->config.sample_rate == sample_rates [i])
+            break;
+
+    flags |= i << SRATE_LSB;                        // i == 15 here when the rate is not in the table
+
+    // all of this stuff only applies to PCM
+
+    if (!(flags & DSD_FLAG)) {
+        if (config->float_norm_exp) {               // non-zero exponent selects floating-point data
+            wpc->config.float_norm_exp = config->float_norm_exp;
+            wpc->config.flags |= CONFIG_FLOAT_DATA;
+            flags |= FLOAT_DATA;
+        }
+        else
+            flags |= ((config->bytes_per_sample * 8) - config->bits_per_sample) << SHIFT_LSB;   // unused bits in the container
+
+        if (config->flags & CONFIG_HYBRID_FLAG) {
+            flags |= HYBRID_FLAG | HYBRID_BITRATE | HYBRID_BALANCE;
+
+            if (!(wpc->config.flags & CONFIG_SHAPE_OVERRIDE)) {         // no override: enable automatic noise shaping
+                wpc->config.flags |= CONFIG_HYBRID_SHAPE | CONFIG_AUTO_SHAPING;
+                flags |= HYBRID_SHAPE | NEW_SHAPING;
+            }
+            else if (wpc->config.flags & CONFIG_HYBRID_SHAPE) {         // override with an explicit shaping weight
+                wpc->config.shaping_weight = config->shaping_weight;
+                flags |= HYBRID_SHAPE | NEW_SHAPING;
+            }
+
+            if (wpc->config.flags & (CONFIG_CROSS_DECORR | CONFIG_OPTIMIZE_WVC))
+                flags |= CROSS_DECORR;
+
+            if (config->flags & CONFIG_BITRATE_KBPS) {                  // kbps -> bits per sample * 256, rounded
+                bps = (uint32_t) floor (config->bitrate * 256000.0 / config->sample_rate / config->num_channels + 0.5);
+
+                if (bps > (64 << 8))                                    // clamp to 64 bits per sample
+                    bps = 64 << 8;
+            }
+            else
+                bps = (uint32_t) floor (config->bitrate * 256.0 + 0.5); // bitrate already given in bits per sample
+        }
+        else
+            flags |= CROSS_DECORR;
+
+        if (!(config->flags & CONFIG_JOINT_OVERRIDE) || (config->flags & CONFIG_JOINT_STEREO))  // joint stereo unless overridden off
+            flags |= JOINT_STEREO;
+
+        if (config->flags & CONFIG_CREATE_WVC)
+            wpc->wvc_flag = TRUE;
+    }
+
+    // if a channel-identities string was specified, process that here, otherwise all channels
+    // not present in the channel mask are considered "unassigned"
+
+    if (chan_ids) {
+        int lastchan = 0; uint32_t mask_copy = chan_mask;   // unsigned copy so that testing bit 31 is well defined
+
+        if ((int) strlen ((char *) chan_ids) > num_chans) {          // can't be more than num channels!
+            strcpy (wpc->error_message, "chan_ids longer than num channels!");
+            return FALSE;
+        }
+
+        // skip past channels that are specified in the channel mask (no reason to store those)
+
+        while (*chan_ids)
+            if (*chan_ids <= 32 && *chan_ids > lastchan && (mask_copy & (1U << (*chan_ids-1)))) {    // 1U: ID 32 maps to bit 31
+                mask_copy &= ~(1U << (*chan_ids-1));
+                lastchan = *chan_ids++;
+            }
+            else
+                break;
+
+        // now scan the string for an actually defined channel (and don't store if there aren't any)
+
+        for (i = 0; chan_ids [i]; i++)
+            if (chan_ids [i] != 0xff) {
+                wpc->channel_identities = (unsigned char *) strdup ((char *) chan_ids);     // copy of the remaining IDs only
+                break;
+            }
+    }
+
+    // This loop goes through all the channels and creates the Wavpack "streams" for them to go in.
+    // A stream can hold either one or two channels, so we have several rules to determine how many
+    // channels will go in each stream.
+
+    for (wpc->current_stream = 0; num_chans; wpc->current_stream++) {
+        WavpackStream *wps = malloc (sizeof (WavpackStream));       // NOTE(review): result is not checked before CLEAR below
+        unsigned char left_chan_id = 0, right_chan_id = 0;          // 0 = not yet determined
+        int pos, chans = 1;                                         // mono unless a pairing rule applies
+
+        // allocate the stream and initialize the pointer to it
+        wpc->streams = realloc (wpc->streams, (wpc->current_stream + 1) * sizeof (wpc->streams [0]));   // NOTE(review): unchecked
+        wpc->streams [wpc->current_stream] = wps;
+        CLEAR (*wps);
+
+        // if there are any bits [still] set in the channel_mask, get the next one or two IDs from there
+        if (chan_mask)
+            for (pos = 0; pos < 32; ++pos)
+                if (chan_mask & (1U << pos)) {      // 1U: a signed 1 << 31 would be undefined behavior
+                    if (left_chan_id) {
+                        right_chan_id = pos + 1;    // candidate partner; its mask bit stays set until a pairing is confirmed
+                        break;
+                    }
+                    else {
+                        chan_mask &= ~(1U << pos);  // left channel is consumed immediately
+                        left_chan_id = pos + 1;
+                    }
+                }
+
+        // next check for any channels identified in the channel-identities string
+        while (!right_chan_id && chan_ids && *chan_ids)
+            if (left_chan_id)
+                right_chan_id = *chan_ids;          // peek only; consumed below if the pair is accepted
+            else
+                left_chan_id = *chan_ids++;
+
+        // assume anything we did not get is "unassigned"
+        if (!left_chan_id)
+            left_chan_id = right_chan_id = 0xff;
+        else if (!right_chan_id)
+            right_chan_id = 0xff;
+
+        // if we have 2 channels, this is where we decide if we can combine them into one stream:
+        // 1. they are "unassigned" and we've been told to combine unassigned pairs, or
+        // 2. they appear together in the valid "pairings" list
+        if (num_chans >= 2) {
+            if ((config->flags & CONFIG_PAIR_UNDEF_CHANS) && left_chan_id == 0xff && right_chan_id == 0xff)
+                chans = 2;
+            else
+                for (i = 0; i < NUM_STEREO_PAIRS; ++i)
+                    if ((left_chan_id == stereo_pairs [i].a && right_chan_id == stereo_pairs [i].b) ||
+                        (left_chan_id == stereo_pairs [i].b && right_chan_id == stereo_pairs [i].a)) {
+                            if (right_chan_id <= 32 && (chan_mask & (1U << (right_chan_id-1))))     // partner came from the mask
+                                chan_mask &= ~(1U << (right_chan_id-1));
+                            else if (chan_ids && *chan_ids == right_chan_id)                        // partner came from chan_ids
+                                chan_ids++;
+
+                            chans = 2;
+                            break;
+                        }
+        }
+
+        num_chans -= chans;
+
+        if (num_chans && wpc->current_stream == NEW_MAX_STREAMS - 1)    // out of streams with channels left: fails below
+            break;
+
+        memcpy (wps->wphdr.ckID, "wvpk", 4);
+        wps->wphdr.ckSize = sizeof (WavpackHeader) - 8;
+        SET_TOTAL_SAMPLES (wps->wphdr, wpc->total_samples);
+        wps->wphdr.version = wpc->stream_version;
+        wps->wphdr.flags = flags;
+        wps->bits = bps;
+
+        if (!wpc->current_stream)                   // first stream of the group
+            wps->wphdr.flags |= INITIAL_BLOCK;
+
+        if (!num_chans)                             // last stream of the group
+            wps->wphdr.flags |= FINAL_BLOCK;
+
+        if (chans == 1) {                           // stereo-only options are cleared for a mono stream
+            wps->wphdr.flags &= ~(JOINT_STEREO | CROSS_DECORR | HYBRID_BALANCE);
+            wps->wphdr.flags |= MONO_FLAG;
+        }
+    }
+
+    wpc->num_streams = wpc->current_stream;
+    wpc->current_stream = 0;
+
+    if (num_chans) {                                // only reachable through the NEW_MAX_STREAMS break above
+        strcpy (wpc->error_message, "too many channels!");
+        return FALSE;
+    }
+
+    if (config->flags & CONFIG_EXTRA_MODE)
+        wpc->config.xmode = config->xmode ? config->xmode : 1;     // extra mode without an explicit level defaults to 1
+
+    return TRUE;
+}
+
+// This function allows setting the Core Audio File channel layout, many of which do not
+// conform to the Microsoft ordering standard that Wavpack requires internally (at least for
+// those channels present in the "channel mask"). In addition to the layout tag, this function
+// allows a reordering string to be stored in the file to allow the unpacker to reorder the
+// channels back to the specified layout (if it is aware of this feature and wants to restore
+// the CAF order). The number of channels in the layout is specified in the lower nybble of
+// the layout word, and if a reorder string is specified it must be that long. Note that all
+// the reordering is actually done outside of this library, and that if reordering is done
+// then the appropriate qmode bit must be set to ensure that any MD5 sum is stored with a new
+// ID so that old decoders don't try to verify it (and to let the decoder know that a reorder
+// might be required).
+//
+// Note: This function should only be used to encode Core Audio files in such a way that a
+// verbatim archive can be created. Applications can just include the chan_ids parameter in
+// the call to WavpackSetConfiguration64() if there are non-Microsoft channels to specify,
+// or do nothing special if only Microsoft channels are present (the vast majority of cases).
+
+int WavpackSetChannelLayout (WavpackContext *wpc, uint32_t layout_tag, const unsigned char *reorder)
+{
+    int nchans = layout_tag & 0xff;                 // channel count is read from the low 8 bits of the tag
+
+    if ((layout_tag & 0xff00ff00) || nchans > wpc->config.num_channels)     // bits 8-15 and 24-31 must be clear
+        return FALSE;
+
+    wpc->channel_layout = layout_tag;
+
+    if (wpc->channel_reordering) {                  // drop any reorder table left by an earlier call
+        free (wpc->channel_reordering);
+        wpc->channel_reordering = NULL;
+    }
+
+    if (nchans && reorder) {
+        int min_index = 256, i;                     // 256 is above any unsigned char value
+
+        for (i = 0; i < nchans; ++i)                // find the smallest index in the caller's table
+            if (reorder [i] < min_index)
+                min_index = reorder [i];
+
+        wpc->channel_reordering = malloc (nchans);
+
+        if (wpc->channel_reordering)                // on allocation failure the table is omitted and TRUE is still returned
+            for (i = 0; i < nchans; ++i)
+                wpc->channel_reordering [i] = reorder [i] - min_index;      // stored relative to the smallest index
+    }
+
+    return TRUE;
+}
+
+// Prepare to actually pack samples by determining the size of the WavPack
+// blocks and allocating sample buffers and initializing each stream. Call
+// after WavpackSetConfiguration() and before WavpackPackSamples(). A return
+// of FALSE indicates an error.
+
+static int write_metadata_block (WavpackContext *wpc);     // forward declaration
+
+int WavpackPackInit (WavpackContext *wpc)
+{
+    if (wpc->metabytes > 16384)             // 16384 bytes still leaves plenty of room for audio
+        write_metadata_block (wpc);         //  in this block (otherwise write a special one)
+
+    // The default block size is a compromise. Longer blocks provide better encoding efficiency,
+    // but longer blocks adversely affect memory requirements and seeking performance. For WavPack
+    // version 5.0, the default block sizes have been reduced by half from the previous version,
+    // but the difference in encoding efficiency will generally be less than 0.1 percent.
+
+    if (wpc->dsd_multiplier) {              // set by the DSD branch of WavpackSetConfiguration64()
+        wpc->block_samples = (wpc->config.sample_rate % 7) ? 48000 : 44100;     // 44100 when the rate is a multiple of 7
+
+        if (wpc->config.flags & CONFIG_HIGH_FLAG)
+            wpc->block_samples /= 2;
+
+        if (wpc->config.num_channels == 1)
+            wpc->block_samples *= 2;
+
+        while (wpc->block_samples > 12000 && wpc->block_samples * wpc->config.num_channels > 300000)   // cap for many channels
+            wpc->block_samples /= 2;
+    }
+    else {
+        int divisor = (wpc->config.flags & CONFIG_HIGH_FLAG) ? 2 : 4;
+
+        while (wpc->config.sample_rate % divisor)   // step down to a divisor that divides the rate evenly (1 always does)
+            divisor--;
+
+        wpc->block_samples = wpc->config.sample_rate / divisor;
+
+        while (wpc->block_samples > 12000 && wpc->block_samples * wpc->config.num_channels > 75000)    // cap for many channels
+            wpc->block_samples /= 2;
+
+        while (wpc->block_samples * wpc->config.num_channels < 20000)   // NOTE(review): never ends if either factor is 0
+            wpc->block_samples *= 2;
+    }
+
+    if (wpc->config.block_samples) {        // caller forced a block size
+        if ((wpc->config.flags & CONFIG_MERGE_BLOCKS) &&
+            wpc->block_samples > (uint32_t) wpc->config.block_samples) {
+                wpc->block_boundary = wpc->config.block_samples;
+                wpc->block_samples /= wpc->config.block_samples;    // these two lines round the default down to a
+                wpc->block_samples *= wpc->config.block_samples;    //  whole multiple of the requested size
+        }
+        else
+            wpc->block_samples = wpc->config.block_samples;
+    }
+
+    wpc->ave_block_samples = wpc->block_samples;
+    wpc->max_samples = wpc->block_samples + (wpc->block_samples >> 1);     // buffer holds 1.5 blocks
+
+    for (wpc->current_stream = 0; wpc->current_stream < wpc->num_streams; wpc->current_stream++) {
+        WavpackStream *wps = wpc->streams [wpc->current_stream];
+
+        wps->sample_buffer = malloc (wpc->max_samples * (wps->wphdr.flags & MONO_FLAG ? 4 : 8));   // NOTE(review): unchecked
+
+#ifdef ENABLE_DSD
+        if (wps->wphdr.flags & DSD_FLAG)
+            pack_dsd_init (wpc);
+        else
+#endif
+            pack_init (wpc);                // called once per stream, with wpc->current_stream selecting it
+    }
+
+    return TRUE;
+}
+
+// Pack the specified samples. Samples must be stored in longs in the native
+// endian format of the executing processor. The number of samples specified
+// indicates composite samples (sometimes called "frames"). So, the actual
+// number of data points would be this "sample_count" times the number of
+// channels. Note that samples are accumulated here until enough exist to
+// create a complete WavPack block (or several blocks for multichannel audio).
+// If an application wants to break a block at a specific sample, then it must
+// simply call WavpackFlushSamples() to force an early termination. Completed
+// WavPack blocks are sent to the function provided in the initial call to
+// WavpackOpenFileOutput(). A return of FALSE indicates an error.
+
+static int pack_streams (WavpackContext *wpc, uint32_t block_samples);                             // forward declaration
+static int create_riff_header (WavpackContext *wpc, int64_t total_samples, void *outbuffer);       // forward declaration
+
+int WavpackPackSamples (WavpackContext *wpc, int32_t *sample_buffer, uint32_t sample_count)
+{
+    int nch = wpc->config.num_channels;     // stride between successive samples of one channel in the interleaved input
+
+    while (sample_count) {
+        int32_t *source_pointer = sample_buffer;    // first channel of the next stream within a frame
+        unsigned int samples_to_copy;
+
+        if (!wpc->riff_header_added && !wpc->riff_header_created && !wpc->file_format) {   // no wrapper yet: generate one
+            char riff_header [128];
+
+            if (!add_to_metadata (wpc, riff_header, create_riff_header (wpc, wpc->total_samples, riff_header), ID_RIFF_HEADER))
+                return FALSE;
+        }
+
+        if (wpc->acc_samples + sample_count > wpc->max_samples)     // take only what still fits in the stream buffers
+            samples_to_copy = wpc->max_samples - wpc->acc_samples;
+        else
+            samples_to_copy = sample_count;
+
+        for (wpc->current_stream = 0; wpc->current_stream < wpc->num_streams; wpc->current_stream++) {
+            WavpackStream *wps = wpc->streams [wpc->current_stream];
+            int32_t *dptr, *sptr, cnt;
+
+            dptr = wps->sample_buffer + wpc->acc_samples * (wps->wphdr.flags & MONO_FLAG ? 1 : 2);  // append after buffered data
+            sptr = source_pointer;
+            cnt = samples_to_copy;
+
+            // This code used to just copy the 32-bit samples regardless of the actual size with the
+            // assumption that the caller had properly sign-extended the values (if they were smaller
+            // than 32 bits). However, several people have discovered that if the data isn't properly
+            // sign extended then ugly things happen (e.g. CRC errors that show up only on decode).
+            // To prevent this, we now explicitly sign-extend samples smaller than 32-bit when we
+            // copy, and the performance hit from doing this is very small (generally < 1%).
+
+            if (wps->wphdr.flags & MONO_FLAG) {     // mono stream: one input channel per frame
+                switch (wpc->config.bytes_per_sample) {
+                    case 1:
+                        while (cnt--) {
+                            *dptr++ = (signed char) *sptr;      // sign-extend from 8 bits
+                            sptr += nch;
+                        }
+
+                        break;
+
+                    case 2:
+                        while (cnt--) {
+                            *dptr++ = (int16_t) *sptr;          // sign-extend from 16 bits
+                            sptr += nch;
+                        }
+
+                        break;
+
+                    case 3:
+                        while (cnt--) {
+                            *dptr++ = (*sptr << 8) >> 8;        // sign-extend from 24 bits
+                            sptr += nch;
+                        }
+
+                        break;
+
+                    default:
+                        while (cnt--) {
+                            *dptr++ = *sptr;                    // other sizes are copied unchanged
+                            sptr += nch;
+                        }
+                }
+
+                source_pointer++;           // next stream starts one channel further into the frame
+            }
+            else {                          // stereo stream: two adjacent input channels per frame
+                switch (wpc->config.bytes_per_sample) {
+                    case 1:
+                        while (cnt--) {
+                            *dptr++ = (signed char) sptr [0];
+                            *dptr++ = (signed char) sptr [1];
+                            sptr += nch;
+                        }
+
+                        break;
+
+                    case 2:
+                        while (cnt--) {
+                            *dptr++ = (int16_t) sptr [0];
+                            *dptr++ = (int16_t) sptr [1];
+                            sptr += nch;
+                        }
+
+                        break;
+
+                    case 3:
+                        while (cnt--) {
+                            *dptr++ = (sptr [0] << 8) >> 8;
+                            *dptr++ = (sptr [1] << 8) >> 8;
+                            sptr += nch;
+                        }
+
+                        break;
+
+                    default:
+                        while (cnt--) {
+                            *dptr++ = sptr [0];
+                            *dptr++ = sptr [1];
+                            sptr += nch;
+                        }
+                }
+
+                source_pointer += 2;        // next stream starts two channels further into the frame
+            }
+        }
+
+        sample_buffer += samples_to_copy * nch;     // advance past the frames just consumed
+        sample_count -= samples_to_copy;
+
+        if ((wpc->acc_samples += samples_to_copy) == wpc->max_samples &&   // buffers full: pack one nominal block
+            !pack_streams (wpc, wpc->block_samples))
+                return FALSE;
+    }
+
+    return TRUE;
+}
+
+// Flush all accumulated samples into WavPack blocks. This is normally called
+// after all samples have been sent to WavpackPackSamples(), but can also be
+// called to terminate a WavPack block at a specific sample (in other words it
+// is possible to continue after this operation). This is also called to
+// dump non-audio blocks like those holding metadata for various purposes.
+// A return of FALSE indicates an error.
+
+int WavpackFlushSamples (WavpackContext *wpc)
+{
+    while (wpc->acc_samples) {              // presumably pack_streams() lowers acc_samples each pass - TODO confirm
+        uint32_t block_samples;
+
+        if (wpc->acc_samples > wpc->block_samples)      // more than one nominal block buffered:
+            block_samples = wpc->acc_samples / 2;       //  pack half of it on this pass
+        else
+            block_samples = wpc->acc_samples;           // otherwise pack everything that is left
+
+        if (!pack_streams (wpc, block_samples))
+            return FALSE;
+    }
+
+    if (wpc->metacount)                     // pending metadata goes out in its own block
+        write_metadata_block (wpc);         // NOTE(review): return value is not checked
+
+    return TRUE;
+}
+
+// Note: The following function is no longer required because a proper wav
+// header is now automatically generated for the application. However, if the
+// application wants to generate its own header or wants to include additional
+// chunks, then this function can still be used in which case the automatic
+// wav header generation is suppressed.
+
+// Add wrapper (currently RIFF only) to WavPack blocks. This should be called
+// before sending any audio samples for the RIFF header or after all samples
+// have been sent for any RIFF trailer. WavpackFlushSamples() should be called
+// between sending the last samples and calling this for trailer data to make
+// sure that headers and trailers don't get mixed up in very short files. If
+// the exact contents of the RIFF header are not known because, for example,
+// the file duration is uncertain or trailing chunks are possible, simply write
+// a "dummy" header of the correct length. When all data has been written it
+// will be possible to read the first block written and update the header
+// directly. An example of this can be found in the Audition filter. A
+// return of FALSE indicates an error.
+
+int WavpackAddWrapper (WavpackContext *wpc, void *data, uint32_t bcount)
+{
+    int64_t index = WavpackGetSampleIndex64 (wpc);
+    unsigned char meta_id;
+
+    if (!index || index == -1) {            // sample index of 0 or -1: the data is stored as a header
+        wpc->riff_header_added = TRUE;      // suppresses the automatic header in WavpackPackSamples()
+        meta_id = wpc->file_format ? ID_ALT_HEADER : ID_RIFF_HEADER;       // non-zero file_format selects the "alt" ID
+    }
+    else {                                  // any other index: the data is stored as a trailer
+        wpc->riff_trailer_bytes += bcount;
+        meta_id = wpc->file_format ? ID_ALT_TRAILER : ID_RIFF_TRAILER;
+    }
+
+    return add_to_metadata (wpc, data, bcount, meta_id);
+}
+
+// Store computed MD5 sum in WavPack metadata. Note that the user must compute
+// the 16 byte sum; it is not done here. A return of FALSE indicates an error.
+// If any of the lower 8 bits of qmode are set, then this MD5 is stored with
+// a metadata ID that old decoders do not recognize (because they would not
+// interpret the qmode and would therefore fail the verification).
+
+int WavpackStoreMD5Sum (WavpackContext *wpc, unsigned char data [16])
+{
+    // default to the regular MD5 item; switch to the alternate ID when any of
+    // the low 8 qmode bits are set
+    unsigned char md5_id = ID_MD5_CHECKSUM;
+
+    if (wpc->config.qmode & 0xff)
+        md5_id = ID_ALT_MD5_CHECKSUM;
+
+    // the caller supplies the finished 16-byte digest; it is only queued here
+    return add_to_metadata (wpc, data, 16, md5_id);
+}
+
+// Helper structures used when generating a RIFF / RF64 wav header. They are
+// declared with 4-byte packing so that the 64-bit members are not padded out
+// to an 8-byte boundary and the structures can be copied directly into the
+// header image.
+
+#pragma pack(push,4)
+
+// A chunk ID paired with a 64-bit chunk size (not referenced by the header
+// generation code that follows in this part of the file).
+typedef struct {
+    char ckID [4];
+    uint64_t chunkSize64;
+} CS64Chunk;
+
+// Payload of the "ds64" chunk written for RF64 files: the 64-bit RIFF size,
+// data size and sample count, followed by a table length that
+// create_riff_header() leaves at zero.
+typedef struct {
+    uint64_t riffSize64, dataSize64, sampleCount64;
+    uint32_t tableLength;
+} DS64Chunk;
+
+// Placeholder "junk" chunk written in place of the ds64 chunk for regular RIFF
+// files. NOTE(review): its 36 bytes match a ds64 chunk header plus DS64Chunk
+// (assuming ChunkHeader is 8 bytes), presumably so that both header variants
+// have the same length -- confirm.
+typedef struct {
+    char ckID [4];
+    uint32_t ckSize;
+    char junk [28];
+} JunkChunk;
+
+#pragma pack(pop)
+
+// Format string handed to WavpackNativeToLittleEndian() for a DS64Chunk
+// (presumably 'D' = 64-bit field and 'L' = 32-bit field -- confirm against
+// the endian conversion helpers).
+#define DS64ChunkFormat "DDDL"
+
+// Build a wav header (RIFF, or RF64 for very large files) describing the
+// audio configured in the context and store it at "outbuffer".
+//
+//   wpc            context supplying the audio configuration and the number
+//                  of trailer bytes queued so far (riff_trailer_bytes)
+//   total_samples  number of samples the header should describe, or -1 if
+//                  unknown (a size of just under 2 GB of data is then assumed)
+//   outbuffer      destination for the header; the caller must supply enough
+//                  room (the caller in this file passes a 128-byte buffer)
+//
+// Returns the number of bytes written, or FALSE (with wpc->error_message set)
+// if the float data is not normalized. wpc->riff_header_created is set as soon
+// as the function is entered, even when it then fails.
+
+static int create_riff_header (WavpackContext *wpc, int64_t total_samples, void *outbuffer)
+{
+    int do_rf64 = 0, write_junk = 1;
+    ChunkHeader ds64hdr, datahdr, fmthdr;
+    char *outptr = outbuffer;
+    RiffChunkHeader riffhdr;
+    DS64Chunk ds64_chunk;
+    JunkChunk junkchunk;
+    WaveHeader wavhdr;
+
+    int64_t total_data_bytes, total_riff_bytes;
+    int32_t channel_mask = wpc->config.channel_mask;
+    int32_t sample_rate = wpc->config.sample_rate;
+    int bytes_per_sample = wpc->config.bytes_per_sample;
+    int bits_per_sample = wpc->config.bits_per_sample;
+    // wav format tag: 3 for floating-point data, 1 for integer PCM
+    int format = (wpc->config.float_norm_exp) ? 3 : 1;
+    int num_channels = wpc->config.num_channels;
+    // size of the basic "fmt " payload; enlarged below for the extensible form
+    int wavhdrsize = 16;
+
+    wpc->riff_header_created = TRUE;
+
+    // only float data with a normalization exponent of 127 can be described
+    if (format == 3 && wpc->config.float_norm_exp != 127) {
+        strcpy (wpc->error_message, "can't create valid RIFF wav header for non-normalized floating data!");
+        return FALSE;
+    }
+
+    // unknown length: assume as many samples as fit in 0x7ffff000 data bytes
+    if (total_samples == -1)
+        total_samples = 0x7ffff000 / (bytes_per_sample * num_channels);
+
+    total_data_bytes = total_samples * bytes_per_sample * num_channels;
+
+    // too much data for 32-bit chunk sizes: switch to RF64, where the ds64
+    // chunk is written instead of the junk placeholder
+    if (total_data_bytes > 0xff000000) {
+        write_junk = 0;
+        do_rf64 = 1;
+    }
+
+    CLEAR (wavhdr);
+
+    wavhdr.FormatTag = format;
+    wavhdr.NumChannels = num_channels;
+    wavhdr.SampleRate = sample_rate;
+    wavhdr.BytesPerSecond = sample_rate * num_channels * bytes_per_sample;
+    wavhdr.BlockAlign = bytes_per_sample * num_channels;
+    wavhdr.BitsPerSample = bits_per_sample;
+
+    // More than two channels, or a channel mask other than 0x4 for mono / 0x3
+    // for stereo, requires the extended form of the header: format tag 0xfffe
+    // with the real format moved to SubFormat and the whole WaveHeader written.
+    if (num_channels > 2 || channel_mask != 0x5 - num_channels) {
+        wavhdrsize = sizeof (wavhdr);
+        wavhdr.cbSize = 22;
+        wavhdr.ValidBitsPerSample = bits_per_sample;
+        wavhdr.SubFormat = format;
+        wavhdr.ChannelMask = channel_mask;
+        wavhdr.FormatTag = 0xfffe;
+        wavhdr.BitsPerSample = bytes_per_sample * 8;    // container size, valid bits stored above
+        wavhdr.GUID [4] = 0x10;
+        wavhdr.GUID [6] = 0x80;
+        wavhdr.GUID [9] = 0xaa;
+        wavhdr.GUID [11] = 0x38;
+        wavhdr.GUID [12] = 0x9b;
+        wavhdr.GUID [13] = 0x71;
+    }
+
+    // the 4-character IDs fill their fields exactly, so no terminator is stored
+    strncpy (riffhdr.ckID, do_rf64 ? "RF64" : "RIFF", sizeof (riffhdr.ckID));
+    strncpy (riffhdr.formType, "WAVE", sizeof (riffhdr.formType));
+    // sum of every header chunk, the audio data and any trailer bytes queued so far
+    total_riff_bytes = sizeof (riffhdr) + wavhdrsize + sizeof (datahdr) + total_data_bytes + wpc->riff_trailer_bytes;
+    if (do_rf64) total_riff_bytes += sizeof (ds64hdr) + sizeof (ds64_chunk);
+    if (write_junk) total_riff_bytes += sizeof (junkchunk);
+    strncpy (fmthdr.ckID, "fmt ", sizeof (fmthdr.ckID));
+    strncpy (datahdr.ckID, "data", sizeof (datahdr.ckID));
+    fmthdr.ckSize = wavhdrsize;
+
+    // zero-filled placeholder chunk for regular RIFF files
+    if (write_junk) {
+        CLEAR (junkchunk);
+        strncpy (junkchunk.ckID, "junk", sizeof (junkchunk.ckID));
+        junkchunk.ckSize = sizeof (junkchunk) - 8;
+        WavpackNativeToLittleEndian (&junkchunk, ChunkHeaderFormat);
+    }
+
+    // RF64 keeps the real sizes in the ds64 chunk and marks the 32-bit size
+    // fields with all ones; otherwise the 32-bit fields hold the sizes directly
+    if (do_rf64) {
+        strncpy (ds64hdr.ckID, "ds64", sizeof (ds64hdr.ckID));
+        ds64hdr.ckSize = sizeof (ds64_chunk);
+        CLEAR (ds64_chunk);
+        ds64_chunk.riffSize64 = total_riff_bytes;
+        ds64_chunk.dataSize64 = total_data_bytes;
+        ds64_chunk.sampleCount64 = total_samples;
+        riffhdr.ckSize = (uint32_t) -1;
+        datahdr.ckSize = (uint32_t) -1;
+        WavpackNativeToLittleEndian (&ds64hdr, ChunkHeaderFormat);
+        WavpackNativeToLittleEndian (&ds64_chunk, DS64ChunkFormat);
+    }
+    else {
+        riffhdr.ckSize = (uint32_t) total_riff_bytes;
+        datahdr.ckSize = (uint32_t) total_data_bytes;
+    }
+
+    // convert the remaining chunks to little-endian before copying them out
+    WavpackNativeToLittleEndian (&riffhdr, ChunkHeaderFormat);
+    WavpackNativeToLittleEndian (&fmthdr, ChunkHeaderFormat);
+    WavpackNativeToLittleEndian (&wavhdr, WaveHeaderFormat);
+    WavpackNativeToLittleEndian (&datahdr, ChunkHeaderFormat);
+
+    // write the RIFF chunks up to just before the data starts
+
+    outptr = (char *) memcpy (outptr, &riffhdr, sizeof (riffhdr)) + sizeof (riffhdr);
+
+    if (do_rf64) {
+        outptr = (char *) memcpy (outptr, &ds64hdr, sizeof (ds64hdr)) + sizeof (ds64hdr);
+        outptr = (char *) memcpy (outptr, &ds64_chunk, sizeof (ds64_chunk)) + sizeof (ds64_chunk);
+    }
+
+    if (write_junk)
+        outptr = (char *) memcpy (outptr, &junkchunk, sizeof (junkchunk)) + sizeof (junkchunk);
+
+    outptr = (char *) memcpy (outptr, &fmthdr, sizeof (fmthdr)) + sizeof (fmthdr);
+    // only the first wavhdrsize bytes of the WaveHeader are part of the file
+    outptr = (char *) memcpy (outptr, &wavhdr, wavhdrsize) + wavhdrsize;
+    outptr = (char *) memcpy (outptr, &datahdr, sizeof (datahdr)) + sizeof (datahdr);
+
+    // number of header bytes produced
+    return (int)(outptr - (char *) outbuffer);
+}
+
+static int block_add_checksum (unsigned char *buffer_start, unsigned char *buffer_end, int bytes);
+
+// Pack "block_samples" samples from every stream's sample buffer into WavPack
+// blocks and pass each finished block to the application's blockout callback
+// (wv_out for the main block and, when wvc_flag is set, wvc_out for the
+// correction block).
+//
+// Returns TRUE on success. On failure FALSE is returned and a description is
+// stored in wpc->error_message. If the output buffers cannot be allocated the
+// function returns before touching any stream or sample accounting; for all
+// other failures the accumulated sample count is still reduced by the block
+// size, exactly as on success.
+
+static int pack_streams (WavpackContext *wpc, uint32_t block_samples)
+{
+    uint32_t max_blocksize, max_chans = 1, bcount;
+    unsigned char *outbuff, *outend, *out2buff = NULL, *out2end = NULL;
+    int result = TRUE, i;
+
+    // for calculating output (block) buffer size, first see if any streams are stereo
+
+    for (i = 0; i < wpc->num_streams; i++)
+        if (!(wpc->streams [i]->wphdr.flags & MONO_FLAG)) {
+            max_chans = 2;
+            break;
+        }
+
+    // then calculate maximum size based on bytes / sample
+
+    max_blocksize = block_samples * max_chans * ((wpc->streams [0]->wphdr.flags & BYTES_STORED) + 1);
+
+    // add margin based on how much "negative" compression is possible with pathological audio
+
+    if ((wpc->config.flags & CONFIG_FLOAT_DATA) && !(wpc->config.flags & CONFIG_SKIP_WVX))
+        max_blocksize += max_blocksize;         // 100% margin for lossless float data
+    else
+        max_blocksize += max_blocksize >> 2;    // otherwise 25% margin for everything else
+
+    max_blocksize += wpc->metabytes + 1024;     // finally, add metadata & another 1K margin
+
+    // allocate the output buffer(s); the second one is only needed when a
+    // correction file is being written
+
+    outbuff = malloc (max_blocksize);
+
+    if (wpc->wvc_flag)
+        out2buff = malloc (max_blocksize);
+
+    if (!outbuff || (wpc->wvc_flag && !out2buff)) {
+        free (outbuff);                         // free (NULL) is a no-op
+        free (out2buff);
+        strcpy (wpc->error_message, "can't allocate memory for output block!");
+        return FALSE;
+    }
+
+    outend = outbuff + max_blocksize;
+
+    if (out2buff)                               // leave out2end NULL when there is no second buffer
+        out2end = out2buff + max_blocksize;
+
+    for (wpc->current_stream = 0; wpc->current_stream < wpc->num_streams; wpc->current_stream++) {
+        WavpackStream *wps = wpc->streams [wpc->current_stream];
+        uint32_t flags = wps->wphdr.flags;
+
+        // reset the magnitude field to the maximum for the stored sample size
+        flags &= ~MAG_MASK;
+        flags += (1 << MAG_LSB) * ((flags & BYTES_STORED) * 8 + 7);
+
+        SET_BLOCK_INDEX (wps->wphdr, wps->sample_index);
+        wps->wphdr.block_samples = block_samples;
+        wps->wphdr.flags = flags;
+        wps->block2buff = out2buff;
+        wps->block2end = out2end;
+        wps->blockbuff = outbuff;
+        wps->blockend = outend;
+
+#ifdef ENABLE_DSD
+        if (flags & DSD_FLAG)
+            result = pack_dsd_block (wpc, wps->sample_buffer);
+        else
+#endif
+            result = pack_block (wpc, wps->sample_buffer);
+
+        // append the block checksum(s): 2 bytes for hybrid and correction
+        // blocks, 4 bytes otherwise
+        if (result) {
+            result = block_add_checksum (outbuff, outend, (flags & HYBRID_FLAG) ? 2 : 4);
+
+            if (result && out2buff)
+                result = block_add_checksum (out2buff, out2end, 2);
+        }
+
+        wps->blockbuff = wps->block2buff = NULL;
+
+        // the packer may have changed the block size; use its value from here on
+        if (wps->wphdr.block_samples != block_samples)
+            block_samples = wps->wphdr.block_samples;
+
+        if (!result) {
+            strcpy (wpc->error_message, "output buffer overflowed!");
+            break;
+        }
+
+        // total block length is the header's ckSize plus the 8 bytes not counted in it
+        bcount = ((WavpackHeader *) outbuff)->ckSize + 8;
+        WavpackNativeToLittleEndian ((WavpackHeader *) outbuff, WavpackHeaderFormat);
+        result = wpc->blockout (wpc->wv_out, outbuff, bcount);
+
+        if (!result) {
+            strcpy (wpc->error_message, "can't write WavPack data, disk probably full!");
+            break;
+        }
+
+        wpc->filelen += bcount;
+
+        if (out2buff) {
+            bcount = ((WavpackHeader *) out2buff)->ckSize + 8;
+            WavpackNativeToLittleEndian ((WavpackHeader *) out2buff, WavpackHeaderFormat);
+            result = wpc->blockout (wpc->wvc_out, out2buff, bcount);
+
+            if (!result) {
+                strcpy (wpc->error_message, "can't write WavPack data, disk probably full!");
+                break;
+            }
+
+            wpc->file2len += bcount;
+        }
+
+        // slide any samples that were not packed to the front of the stream's buffer
+        if (wpc->acc_samples != block_samples)
+            memmove (wps->sample_buffer, wps->sample_buffer + block_samples * (flags & MONO_FLAG ? 1 : 2),
+                (wpc->acc_samples - block_samples) * sizeof (int32_t) * (flags & MONO_FLAG ? 1 : 2));
+    }
+
+    wpc->current_stream = 0;
+    // running average of the block size (7/8 old value + 1/8 new value, rounded)
+    wpc->ave_block_samples = (wpc->ave_block_samples * 0x7 + block_samples + 0x4) >> 3;
+    wpc->acc_samples -= block_samples;
+    free (outbuff);
+
+    if (out2buff)
+        free (out2buff);
+
+    return result;
+}
+
+// Given the pointer to the first block written (to either a .wv or .wvc file),
+// update the block with the actual number of samples written. If the wav
+// header was generated by the library, then it is updated also. This should
+// be done if WavpackSetConfiguration() was called with an incorrect number
+// of samples (or -1). It is the responsibility of the application to read and
+// rewrite the block. An example of this can be found in the Audition filter.
+
+static void block_update_checksum (unsigned char *buffer_start);
+
+void WavpackUpdateNumSamples (WavpackContext *wpc, void *first_block)
+{
+    uint32_t stored_len;
+
+    // work on the header in native byte order and patch in the real sample count
+    WavpackLittleEndianToNative (first_block, WavpackHeaderFormat);
+    SET_TOTAL_SAMPLES (* (WavpackHeader *) first_block, WavpackGetSampleIndex64 (wpc));
+
+    // A wav header generated by the library is regenerated for the final
+    // sample count and written over the stored one, but only when the new
+    // header has exactly the same length as the stored wrapper.
+    // NOTE(review): WavpackGetWrapperLocation() performs its own little-endian
+    // to native conversion on a header that is already native at this point;
+    // confirm that this is harmless on big-endian hosts.
+    if (wpc->riff_header_created) {
+        if (WavpackGetWrapperLocation (first_block, &stored_len)) {
+            unsigned char fresh_header [128];
+
+            if (stored_len == create_riff_header (wpc, WavpackGetSampleIndex64 (wpc), fresh_header))
+                memcpy (WavpackGetWrapperLocation (first_block, NULL), fresh_header, stored_len);
+        }
+    }
+
+    // the block contents changed, so refresh its checksum before converting
+    // the header back to little-endian
+    block_update_checksum (first_block);
+    WavpackNativeToLittleEndian (first_block, WavpackHeaderFormat);
+}
+
+// Note: The following function is no longer required because the wav header
+// automatically generated for the application will also be updated by
+// WavpackUpdateNumSamples (). However, if the application wants to generate
+// its own header or wants to include additional chunks, then this function
+// still must be used to update the application generated header.
+
+// Given the pointer to the first block written to a WavPack file, this
+// function returns the location of the stored RIFF header that was originally
+// written with WavpackAddWrapper(). This would normally be used to update
+// the wav header to indicate that a different number of samples was actually
+// written or if additional RIFF chunks are written at the end of the file.
+// The "size" parameter can be set to non-NULL to obtain the exact size of the
+// RIFF header, and the function will return FALSE if the header is not found
+// in the block's metadata (or it is not a valid WavPack block). It is the
+// responsibility of the application to read and rewrite the block. An example
+// of this can be found in the Audition filter.
+
+static void *find_metadata (void *wavpack_block, int desired_id, uint32_t *size);
+
+void *WavpackGetWrapperLocation (void *first_block, uint32_t *size)
+{
+    void *wrapper;
+
+    // the metadata search needs the block header in native byte order
+    WavpackLittleEndianToNative (first_block, WavpackHeaderFormat);
+
+    // look for a RIFF header first and fall back to the alternate header ID
+    wrapper = find_metadata (first_block, ID_RIFF_HEADER, size);
+
+    if (wrapper == NULL)
+        wrapper = find_metadata (first_block, ID_ALT_HEADER, size);
+
+    // hand the block back in the byte order it arrived in
+    WavpackNativeToLittleEndian (first_block, WavpackHeaderFormat);
+
+    return wrapper;
+}
+
+// Scan the metadata items of a WavPack block (header already in native byte
+// order) for the first item whose ID matches "desired_id". Returns a pointer
+// to that item's payload, storing its exact length in "*size" when "size" is
+// not NULL. Returns NULL if the block does not start with "wvpk", no such
+// item exists, or the matching item's payload would run past the block.
+
+static void *find_metadata (void *wavpack_block, int desired_id, uint32_t *size)
+{
+    WavpackHeader *hdr = wavpack_block;
+    unsigned char *dp, item_id;
+    int32_t remaining, item_len;
+
+    if (strncmp (hdr->ckID, "wvpk", 4))
+        return NULL;
+
+    // bytes of metadata that follow the header
+    remaining = hdr->ckSize - sizeof (WavpackHeader) + 8;
+    dp = (unsigned char *)(hdr + 1);
+
+    while (remaining >= 2) {
+        // every item starts with an ID byte and a length byte counted in 16-bit words
+        item_id = dp [0];
+        item_len = dp [1] << 1;
+        dp += 2;
+        remaining -= 2;
+
+        // large items carry two more length bytes
+        if (item_id & ID_LARGE) {
+            if (remaining < 2)
+                return NULL;
+
+            item_len += ((uint32_t) dp [0] << 9) + ((uint32_t) dp [1] << 17);
+            dp += 2;
+            remaining -= 2;
+        }
+
+        if ((item_id & ID_UNIQUE) == desired_id) {
+            if (item_len > remaining)           // payload would overrun the block
+                return NULL;
+
+            // an odd-sized item has one pad byte that is not part of the data
+            if (size)
+                *size = item_len - ((item_id & ID_ODD_SIZE) ? 1 : 0);
+
+            return dp;
+        }
+
+        // not the one we want, skip over its payload
+        remaining -= item_len;
+        dp += item_len;
+    }
+
+    return NULL;
+}
+
+// Append one metadata item to the WavPack block that starts at "buffer_start"
+// and enlarge the block header's ckSize to cover it. Returns FALSE, without
+// writing anything, when the item would not fit before "buffer_end".
+
+int copy_metadata (WavpackMetadata *wpmd, unsigned char *buffer_start, unsigned char *buffer_end)
+{
+    WavpackHeader *hdr = (WavpackHeader *) buffer_start;
+    const int is_large = (wpmd->byte_length > 510);     // needs the 4-byte item header
+    const int prefix_len = is_large ? 4 : 2;
+    uint32_t item_size = wpmd->byte_length + (wpmd->byte_length & 1);   // payload rounded up to even
+    unsigned char *dst = buffer_start + (hdr->ckSize + 8);              // current end of the block
+    unsigned char id_byte = wpmd->id | (wpmd->byte_length & 1 ? ID_ODD_SIZE : 0);
+
+    item_size += prefix_len;
+
+    if (dst + item_size >= buffer_end)
+        return FALSE;
+
+    if (is_large)
+        id_byte |= ID_LARGE;
+
+    // item header: ID byte, then the length in 16-bit words (1 or 3 bytes)
+    dst [0] = id_byte;
+    dst [1] = (wpmd->byte_length + 1) >> 1;
+
+    if (is_large) {
+        dst [2] = (wpmd->byte_length + 1) >> 9;
+        dst [3] = (wpmd->byte_length + 1) >> 17;
+    }
+
+    if (wpmd->data && wpmd->byte_length) {
+        memcpy (dst + prefix_len, wpmd->data, wpmd->byte_length);
+
+        // an odd-sized payload is followed by a zero pad byte
+        if (wpmd->byte_length & 1)
+            dst [item_size - 1] = 0;
+    }
+
+    hdr->ckSize += item_size;
+    return TRUE;
+}
+
+// Queue "bcount" bytes of "data" as metadata with the given "id". The bytes
+// are appended to the last queued item when it has the same ID; otherwise a
+// new (initially empty) item is added to the queue first. Whenever the queued
+// total reaches 1000000 bytes the queue is written out as a metadata block.
+//
+// Returns TRUE on success. FALSE is returned if the metadata block could not
+// be written or if memory could not be allocated; in the allocation case
+// wpc->error_message is set here and everything queued so far is left intact.
+
+static int add_to_metadata (WavpackContext *wpc, void *data, uint32_t bcount, unsigned char id)
+{
+    WavpackMetadata *mdp;
+    unsigned char *src = data;
+
+    while (bcount) {
+        if (wpc->metacount) {
+            uint32_t bc = bcount;
+
+            mdp = wpc->metadata + wpc->metacount - 1;
+
+            if (mdp->id == id) {
+                uint32_t new_length;
+                void *grown;
+
+                // never queue more than 1000000 bytes before flushing
+                if (wpc->metabytes + bcount > 1000000)
+                    bc = 1000000 - wpc->metabytes;
+
+                // grow through a temporary so the existing data is not lost
+                // (or leaked) if the allocation fails
+                new_length = mdp->byte_length + bc;
+                grown = realloc (mdp->data, new_length);
+
+                if (!grown && new_length) {
+                    strcpy (wpc->error_message, "can't allocate memory for metadata!");
+                    return FALSE;
+                }
+
+                mdp->data = grown;
+                memcpy ((char *) mdp->data + mdp->byte_length, src, bc);
+                mdp->byte_length += bc;
+                wpc->metabytes += bc;
+                bcount -= bc;
+                src += bc;
+
+                if (wpc->metabytes >= 1000000 && !write_metadata_block (wpc))
+                    return FALSE;
+            }
+        }
+
+        // data still pending: start a new, empty item with this ID; the next
+        // pass through the loop appends to it
+        if (bcount) {
+            WavpackMetadata *enlarged = realloc (wpc->metadata, (wpc->metacount + 1) * sizeof (WavpackMetadata));
+
+            if (!enlarged) {
+                strcpy (wpc->error_message, "can't allocate memory for metadata!");
+                return FALSE;
+            }
+
+            wpc->metadata = enlarged;
+            mdp = wpc->metadata + wpc->metacount++;
+            mdp->byte_length = 0;
+            mdp->data = NULL;
+            mdp->id = id;
+        }
+    }
+
+    return TRUE;
+}
+
+// Serialize one metadata item (ID byte, length in 16-bit words, payload and,
+// for odd sizes, a zero pad byte) at "outdata". Returns the position just
+// past the bytes written.
+
+static char *write_metadata (WavpackMetadata *wpmd, char *outdata)
+{
+    // the length is stored as a count of 16-bit words, split over up to 3 bytes
+    const unsigned char len_low = (wpmd->byte_length + 1) >> 1;
+    const unsigned char len_mid = (wpmd->byte_length + 1) >> 9;
+    const unsigned char len_high = (wpmd->byte_length + 1) >> 17;
+    unsigned char tag = wpmd->id;
+    char *cursor = outdata;
+
+    if (wpmd->byte_length & 1)
+        tag |= ID_ODD_SIZE;
+
+    // the upper length bytes are only written when one of them is non-zero
+    if (len_mid || len_high)
+        tag |= ID_LARGE;
+
+    *cursor++ = tag;
+    *cursor++ = len_low;
+
+    if (tag & ID_LARGE) {
+        *cursor++ = len_mid;
+        *cursor++ = len_high;
+    }
+
+    if (wpmd->data && wpmd->byte_length) {
+        memcpy (cursor, wpmd->data, wpmd->byte_length);
+        cursor += wpmd->byte_length;
+
+        if (wpmd->byte_length & 1)          // pad odd-sized data to an even length
+            *cursor++ = 0;
+    }
+
+    return cursor;
+}
+
+// Write all queued metadata items as a single WavPack block that contains no
+// audio (block_samples is 0) and empty the queue. Does nothing and returns
+// TRUE when nothing is queued.
+//
+// Returns FALSE with wpc->error_message set if the block buffer cannot be
+// allocated (the queue is then left untouched) or if the application's
+// blockout callback reports failure (the queue has already been emptied).
+
+static int write_metadata_block (WavpackContext *wpc)
+{
+    char *block_buff, *block_ptr;
+    WavpackHeader *wphdr;
+
+    if (wpc->metacount) {
+        int metacount = wpc->metacount, block_size = sizeof (WavpackHeader);
+        WavpackMetadata *wpmdp = wpc->metadata;
+
+        // total up the block: header plus, for each item, its payload rounded
+        // up to an even length and a 2- or 4-byte item header
+        while (metacount--) {
+            block_size += wpmdp->byte_length + (wpmdp->byte_length & 1);
+            block_size += (wpmdp->byte_length > 510) ? 4 : 2;
+            wpmdp++;
+        }
+
+        // allocate 6 extra bytes for 4-byte checksum (which we add last)
+        block_buff = malloc (block_size + 6);
+
+        if (!block_buff) {
+            strcpy (wpc->error_message, "can't allocate memory for metadata block!");
+            return FALSE;
+        }
+
+        wphdr = (WavpackHeader *) block_buff;
+
+        CLEAR (*wphdr);
+        memcpy (wphdr->ckID, "wvpk", 4);
+        SET_TOTAL_SAMPLES (*wphdr, wpc->total_samples);
+        wphdr->version = wpc->stream_version;
+        wphdr->ckSize = block_size - 8;         // ckSize does not count the first 8 header bytes
+        wphdr->block_samples = 0;
+
+        block_ptr = (char *)(wphdr + 1);
+
+        wpmdp = wpc->metadata;
+
+        // serialize each item and release it from the queue
+        while (wpc->metacount) {
+            block_ptr = write_metadata (wpmdp, block_ptr);
+            wpc->metabytes -= wpmdp->byte_length;
+            free_metadata (wpmdp++);
+            wpc->metacount--;
+        }
+
+        free (wpc->metadata);
+        wpc->metadata = NULL;
+        // add a 4-byte checksum here (increases block size by 6)
+        block_add_checksum ((unsigned char *) block_buff, (unsigned char *) block_buff + (block_size += 6), 4);
+        WavpackNativeToLittleEndian ((WavpackHeader *) block_buff, WavpackHeaderFormat);
+
+        if (!wpc->blockout (wpc->wv_out, block_buff, block_size)) {
+            free (block_buff);
+            strcpy (wpc->error_message, "can't write WavPack data, disk probably full!");
+            return FALSE;
+        }
+
+        free (block_buff);
+    }
+
+    return TRUE;
+}
+
+// Release the data buffer owned by a metadata item (if it has one) and clear
+// the pointer; the item structure itself is not freed.
+
+void free_metadata (WavpackMetadata *wpmd)
+{
+    if (!wpmd->data)
+        return;
+
+    free (wpmd->data);
+    wpmd->data = NULL;
+}
+
+// These two functions add or update the block checksums that were introduced in WavPack 5.0.
+// The presence of the checksum is indicated by a flag in the wavpack header (HAS_CHECKSUM)
+// and the actual metadata item should be the last one in the block, and can be either 2 or 4
+// bytes. Of course, older versions of the decoder will simply ignore both of these.
+
+// Append a checksum metadata item of "bytes" (2 or 4) checksum bytes to the
+// block at "buffer_start", whose header must be in native byte order. The
+// header is flagged with HAS_CHECKSUM and its ckSize enlarged before the sum
+// is taken, so the checksum covers the updated header. Returns FALSE, leaving
+// the block untouched, if "bytes" is invalid, the block length is too small
+// or odd, or the item would not fit before "buffer_end".
+
+static int block_add_checksum (unsigned char *buffer_start, unsigned char *buffer_end, int bytes)
+{
+    WavpackHeader *hdr = (WavpackHeader *) buffer_start;
+#ifdef BITSTREAM_SHORTS
+    uint16_t *word = (uint16_t*) buffer_start;
+#else
+    unsigned char *word = buffer_start;
+#endif
+    int block_bytes = hdr->ckSize + 8, words_left, i;
+    uint32_t sum = (uint32_t) -1;
+    unsigned char *tail;
+
+    if (!(bytes == 2 || bytes == 4))
+        return FALSE;
+
+    if (block_bytes < sizeof (WavpackHeader))
+        return FALSE;
+
+    if (block_bytes & 1)
+        return FALSE;
+
+    if (buffer_start + block_bytes + 2 + bytes > buffer_end)
+        return FALSE;
+
+    // update the header first: these changes are included in the checksum
+    hdr->flags |= HAS_CHECKSUM;
+    hdr->ckSize += 2 + bytes;
+
+    // sum the original block as 16-bit words
+#ifdef BITSTREAM_SHORTS
+    for (words_left = block_bytes >> 1; words_left; words_left--)
+        sum = (sum * 3) + *word++;
+#else
+    // the sum is taken over the little-endian form of the header
+    WavpackNativeToLittleEndian (hdr, WavpackHeaderFormat);
+
+    for (words_left = block_bytes >> 1; words_left; words_left--, word += 2)
+        sum = (sum * 3) + word [0] + (word [1] << 8);
+
+    WavpackLittleEndianToNative (hdr, WavpackHeaderFormat);
+#endif
+
+    // item header: ID byte followed by the payload length in 16-bit words
+    tail = buffer_start + block_bytes;
+    *tail++ = ID_BLOCK_CHECKSUM;
+    *tail++ = bytes >> 1;
+
+    // the 2-byte form folds the upper half of the sum into the lower half
+    if (bytes == 2)
+        sum ^= sum >> 16;
+
+    // store the checksum least-significant byte first
+    for (i = 0; i < bytes; i++, sum >>= 8)
+        *tail++ = sum;
+
+    return TRUE;
+}
+
+// Recompute the checksum stored in a block whose contents have been modified.
+// The header at "buffer_start" must be in native byte order. Nothing is done
+// if the header does not have HAS_CHECKSUM set, if no checksum item is found,
+// or if the metadata is malformed; only the first checksum item is updated.
+
+static void block_update_checksum (unsigned char *buffer_start)
+{
+    WavpackHeader *hdr = (WavpackHeader *) buffer_start;
+    unsigned char *dp, item_id;
+    uint32_t remaining, item_len;
+
+    if (!(hdr->flags & HAS_CHECKSUM))
+        return;
+
+    // bytes of metadata that follow the header
+    remaining = hdr->ckSize - sizeof (WavpackHeader) + 8;
+    dp = (unsigned char *)(hdr + 1);
+
+    while (remaining >= 2) {
+        // every item starts with an ID byte and a length byte counted in 16-bit words
+        item_id = dp [0];
+        item_len = dp [1] << 1;
+        dp += 2;
+        remaining -= 2;
+
+        // large items carry two more length bytes
+        if (item_id & ID_LARGE) {
+            if (remaining < 2)
+                return;
+
+            item_len += ((uint32_t) dp [0] << 9) + ((uint32_t) dp [1] << 17);
+            dp += 2;
+            remaining -= 2;
+        }
+
+        if (remaining < item_len)               // payload would overrun the block
+            return;
+
+        if ((item_id & ID_UNIQUE) == ID_BLOCK_CHECKSUM) {
+#ifdef BITSTREAM_SHORTS
+            uint16_t *word = (uint16_t*) buffer_start;
+#else
+            unsigned char *word = buffer_start;
+#endif
+            // everything up to the start of the checksum item is summed
+            int words_left = (int)(dp - 2 - buffer_start) >> 1;
+            int width = (item_len == 4) ? 4 : 2, i;
+            uint32_t sum = (uint32_t) -1;
+
+            // only an even-sized payload of 2 to 4 bytes is accepted
+            if ((item_id & ID_ODD_SIZE) || item_len < 2 || item_len > 4)
+                return;
+
+#ifdef BITSTREAM_SHORTS
+            for (; words_left; words_left--)
+                sum = (sum * 3) + *word++;
+#else
+            // the sum is taken over the little-endian form of the header
+            WavpackNativeToLittleEndian (hdr, WavpackHeaderFormat);
+
+            for (; words_left; words_left--, word += 2)
+                sum = (sum * 3) + word [0] + (word [1] << 8);
+
+            WavpackLittleEndianToNative (hdr, WavpackHeaderFormat);
+#endif
+
+            // the 2-byte form folds the upper half of the sum into the lower half
+            if (width == 2)
+                sum ^= sum >> 16;
+
+            // overwrite the stored checksum, least-significant byte first
+            for (i = 0; i < width; i++, sum >>= 8)
+                *dp++ = sum;
+
+            return;
+        }
+
+        // not the checksum item, skip over its payload
+        remaining -= item_len;
+        dp += item_len;
+    }
+}
diff --git a/third_party/wavpack/src/pack_x64.S b/third_party/wavpack/src/pack_x64.S
new file mode 100644
index 0000000..a8798fd
--- /dev/null
+++ b/third_party/wavpack/src/pack_x64.S
@@ -0,0 +1,1941 @@
+############################################################################
+##                           **** WAVPACK ****                            ##
+##                  Hybrid Lossless Wavefile Compressor                   ##
+##              Copyright (c) 1998 - 2015 Conifer Software.               ##
+##                          All Rights Reserved.                          ##
+##      Distributed under the BSD Software License (see license.txt)      ##
+############################################################################
+
+        .intel_syntax noprefix
+        .text
+
+        .globl  _pack_decorr_stereo_pass_x64win
+        .globl  _pack_decorr_stereo_pass_cont_rev_x64win
+        .globl  _pack_decorr_stereo_pass_cont_x64win
+        .globl  _pack_decorr_mono_buffer_x64win
+        .globl  _pack_decorr_mono_pass_cont_x64win
+        .globl  _scan_max_magnitude_x64win
+        .globl  _log2buffer_x64win
+
+        .globl  pack_decorr_stereo_pass_x64win
+        .globl  pack_decorr_stereo_pass_cont_rev_x64win
+        .globl  pack_decorr_stereo_pass_cont_x64win
+        .globl  pack_decorr_mono_buffer_x64win
+        .globl  pack_decorr_mono_pass_cont_x64win
+        .globl  scan_max_magnitude_x64win
+        .globl  log2buffer_x64win
+
+        .globl  _pack_decorr_stereo_pass_x64
+        .globl  _pack_decorr_stereo_pass_cont_rev_x64
+        .globl  _pack_decorr_stereo_pass_cont_x64
+        .globl  _pack_decorr_mono_buffer_x64
+        .globl  _pack_decorr_mono_pass_cont_x64
+        .globl  _scan_max_magnitude_x64
+        .globl  _log2buffer_x64
+
+        .globl  pack_decorr_stereo_pass_x64
+        .globl  pack_decorr_stereo_pass_cont_rev_x64
+        .globl  pack_decorr_stereo_pass_cont_x64
+        .globl  pack_decorr_mono_buffer_x64
+        .globl  pack_decorr_mono_pass_cont_x64
+        .globl  scan_max_magnitude_x64
+        .globl  log2buffer_x64
+
+# This module contains X64 assembly optimized versions of functions required
+# to encode WavPack files.
+
+# This is an assembly optimized version of the following WavPack function:
+#
+# void pack_decorr_stereo_pass (
+#   struct decorr_pass *dpp,
+#   int32_t *buffer,
+#   int32_t sample_count);
+#
+# It performs a single pass of stereo decorrelation, in place, as specified
+# by the decorr_pass structure. Note that this function does NOT return the
+# dpp->samples_X[] values in the "normalized" positions for terms 1-8, so if
+# the number of samples is not a multiple of MAX_TERM, these must be moved if
+# they are to be used somewhere else.
+#
+# This is written to work on an X86-64 processor (also called the AMD64)
+# running in 64-bit mode and uses the MMX extensions to improve the
+# performance by processing both stereo channels together. It is based on
+# the original MMX code written by Joachim Henke that used MMX intrinsics
+# called from C. Many thanks to Joachim for that!
+#
+# An issue with using MMX for this is that the sample history array in the
+# decorr_pass structure contains separate arrays for each channel while the
+# MMX code wants there to be a single array of dual samples. The fix for
+# this is to convert the data in the arrays on entry and exit, and this is
+# made easy by the fact that the 8 MMX registers hold exactly the required
+# amount of data (64 bytes)!
+#
+# This version has entry points for both the System V ABI and the Windows
+# X64 ABI. It does not use the "red zone" or the "shadow area"; it saves the
+# non-volatile registers for both ABIs on the stack and allocates another
+# 8 bytes on the stack so that it's properly aligned. Note that it does NOT
+# provide unwind data for the Windows ABI (the unpack_x64.asm module for
+# MSVC does). The arguments are passed in registers:
+#
+#                             System V  Windows  
+#   struct decorr_pass *dpp     rdi       rcx
+#   int32_t *buffer             rsi       rdx
+#   int32_t sample_count        edx       r8d
+#
+# During the processing loops, the following registers are used:
+#
+#   rdi         buffer pointer
+#   rsi         termination buffer pointer
+#   rax,rbx,rdx used in default term to reduce calculation         
+#   rbp         decorr_pass pointer
+#   mm0, mm1    scratch
+#   mm2         original sample values
+#   mm3         correlation samples
+#   mm4         0 (for pcmpeqd)
+#   mm5         weights
+#   mm6         delta
+#   mm7         512 (for rounding)
+#
+
+_pack_decorr_stereo_pass_x64win:
+pack_decorr_stereo_pass_x64win:
+        push    rbp
+        push    rbx
+        push    rdi
+        push    rsi
+        sub     rsp, 8
+        mov     rdi, rcx                    # copy params from win regs to Linux regs
+        mov     rsi, rdx                    # so we can leave following code similar
+        mov     rdx, r8
+        mov     rcx, r9
+        jmp     benter
+
+_pack_decorr_stereo_pass_x64:
+pack_decorr_stereo_pass_x64:
+        push    rbp
+        push    rbx
+        push    rdi
+        push    rsi
+        sub     rsp, 8
+
+benter: mov     rbp, rdi                    # rbp = *dpp
+        mov     rdi, rsi                    # rdi = inbuffer
+        mov     esi, edx
+        shl     esi, 3
+        jz      bdone
+        add     rsi, rdi                    # rsi = termination buffer pointer
+
+        // convert samples_A and samples_B array into samples_AB array for MMX
+        // (the MMX registers provide exactly enough storage to do this easily)
+
+        movq        mm0, [rbp+16]
+        punpckldq   mm0, [rbp+48]
+        movq        mm1, [rbp+16]
+        punpckhdq   mm1, [rbp+48]
+        movq        mm2, [rbp+24]
+        punpckldq   mm2, [rbp+56]
+        movq        mm3, [rbp+24]
+        punpckhdq   mm3, [rbp+56]
+        movq        mm4, [rbp+32]
+        punpckldq   mm4, [rbp+64]
+        movq        mm5, [rbp+32]
+        punpckhdq   mm5, [rbp+64]
+        movq        mm6, [rbp+40]
+        punpckldq   mm6, [rbp+72]
+        movq        mm7, [rbp+40]
+        punpckhdq   mm7, [rbp+72]
+
+        movq    [rbp+16], mm0
+        movq    [rbp+24], mm1
+        movq    [rbp+32], mm2
+        movq    [rbp+40], mm3
+        movq    [rbp+48], mm4
+        movq    [rbp+56], mm5
+        movq    [rbp+64], mm6
+        movq    [rbp+72], mm7
+
+        mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  # mm7 = round (512)
+
+        mov     eax, [rbp+4]
+        movd    mm6, eax
+        punpckldq mm6, mm6                  # mm6 = delta (0-7)
+
+        mov     eax, 0xFFFF                 # mask high weights to zero for PMADDWD
+        movd    mm5, eax
+        punpckldq mm5, mm5                  # mm5 = weight mask 0x0000FFFF0000FFFF
+        pand    mm5, [rbp+8]                # mm5 = weight_AB masked to 16-bit
+
+        movq    mm4, [rbp+16]               # preload samples_AB[0]
+
+        mov     al, [rbp]                   # get term and vector to correct loop
+        cmp     al, 17
+        je      buff_term_17_loop
+        cmp     al, 18
+        je      buff_term_18_loop
+        cmp     al, -1
+        je      buff_term_minus_1_loop
+        cmp     al, -2
+        je      buff_term_minus_2_loop
+        cmp     al, -3
+        je      buff_term_minus_3_loop
+
+        pxor    mm4, mm4                    # mm4 = 0 (for pcmpeqd)
+        xor     eax, eax
+        xor     ebx, ebx
+        add     bl, [rbp]
+        mov     ecx, 7
+        and     ebx, ecx
+        jmp     buff_default_term_loop
+
+        .balign  64
+
+buff_default_term_loop:
+        movq    mm2, [rdi]                  # mm2 = left_right
+        movq    mm3, [rbp+16+rax*8]
+        inc     eax
+        and     eax, ecx
+        movq    [rbp+16+rbx*8], mm2
+        inc     ebx
+        and     ebx, ecx
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        psrld   mm0, 15
+        pmaddwd mm0, mm5
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # add shifted sums
+        movq    mm0, mm3
+        movq    [rdi], mm2                  # store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        add     rdi, 8
+        pcmpeqd mm2, mm4                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm4                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pxor    mm5, mm0
+        paddw   mm5, mm2                    # and add to weight_AB
+        pxor    mm5, mm0
+        cmp     rdi, rsi
+        jnz     buff_default_term_loop
+
+        jmp     bdone
+
+        .balign  64
+
+buff_term_17_loop:
+        movq    mm3, mm4                    # get previous calculated value
+        paddd   mm3, mm4
+        psubd   mm3, [rbp+24]
+        movq    [rbp+24], mm4
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        psrld   mm0, 15
+        pmaddwd mm0, mm5
+
+        movq    mm2, [rdi]                  # mm2 = left_right
+        movq    mm4, mm2
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # add shifted sums
+        movq    mm0, mm3
+        movq    [rdi], mm2                  # store result
+        pxor    mm1, mm1
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        add     rdi, 8
+        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pxor    mm5, mm0
+        paddw   mm5, mm2                    # and add to weight_AB
+        pxor    mm5, mm0
+        cmp     rdi, rsi
+        jnz     buff_term_17_loop
+
+        movq    [rbp+16], mm4               # post-store samples_AB[0]
+        jmp     bdone
+
+        .balign  64
+
+buff_term_18_loop:
+        movq    mm3, mm4                    # get previous calculated value
+        psubd   mm3, [rbp+24]
+        psrad   mm3, 1
+        paddd   mm3, mm4                    # mm3 = sam_AB
+        movq    [rbp+24], mm4
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        psrld   mm0, 15
+        pmaddwd mm0, mm5
+
+        movq    mm2, [rdi]                  # mm2 = left_right
+        movq    mm4, mm2
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # add shifted sums
+        movq    mm0, mm3
+        movq    [rdi], mm2                  # store result
+        pxor    mm1, mm1
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        add     rdi, 8
+        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pxor    mm5, mm0
+        paddw   mm5, mm2                    # and add to weight_AB
+        pxor    mm5, mm0
+        cmp     rdi, rsi
+        jnz     buff_term_18_loop
+
+        movq    [rbp+16], mm4               # post-store samples_AB[0]
+        jmp     bdone
+
+        .balign  64
+
+buff_term_minus_1_loop:
+        movq    mm3, mm4                    # mm3 = previous calculated value
+        movq    mm2, [rdi]                  # mm2 = left_right
+        movq    mm4, mm2
+        psrlq   mm4, 32
+        punpckldq mm3, mm2                  # mm3 = sam_AB
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        psrld   mm0, 15
+        pmaddwd mm0, mm5
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # add shifted sums
+        movq    mm0, mm3
+        movq    [rdi], mm2                  # store result
+        pxor    mm1, mm1
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        add     rdi, 8
+        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pcmpeqd mm1, mm1
+        psubd   mm1, mm7
+        psubd   mm1, mm7
+        psubd   mm1, mm0
+        pxor    mm5, mm0
+        paddw   mm5, mm1
+        paddusw mm5, mm2                    # and add to weight_AB
+        psubw   mm5, mm1
+        pxor    mm5, mm0
+        cmp     rdi, rsi
+        jnz     buff_term_minus_1_loop
+
+        movq    [rbp+16], mm4               # post-store samples_AB[0]
+        jmp     bdone
+
+        .balign  64
+
+buff_term_minus_2_loop:
+        movq    mm2, [rdi]                  # mm2 = left_right
+        movq    mm3, mm2
+        psrlq   mm3, 32
+        por     mm3, mm4
+        punpckldq mm4, mm2
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        psrld   mm0, 15
+        pmaddwd mm0, mm5
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # add shifted sums
+        movq    mm0, mm3
+        movq    [rdi], mm2                  # store result
+        pxor    mm1, mm1
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        add     rdi, 8
+        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pcmpeqd mm1, mm1
+        psubd   mm1, mm7
+        psubd   mm1, mm7
+        psubd   mm1, mm0
+        pxor    mm5, mm0
+        paddw   mm5, mm1
+        paddusw mm5, mm2                    # and add to weight_AB
+        psubw   mm5, mm1
+        pxor    mm5, mm0
+        cmp     rdi, rsi
+        jnz     buff_term_minus_2_loop
+
+        movq    [rbp+16], mm4               # post-store samples_AB[0]
+        jmp     bdone
+
+        .balign  64
+
+buff_term_minus_3_loop:
+        movq    mm2, [rdi]                  # mm2 = left_right
+        movq    mm3, mm4                    # mm3 = previous calculated value
+        movq    mm4, mm2                    # mm0 = swap dwords of new data
+        psrlq   mm4, 32
+        punpckldq mm4, mm2                  # mm3 = sam_AB
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        psrld   mm0, 15
+        pmaddwd mm0, mm5
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # add shifted sums
+        movq    mm0, mm3
+        movq    [rdi], mm2                  # store result
+        pxor    mm1, mm1
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        add     rdi, 8
+        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pcmpeqd mm1, mm1
+        psubd   mm1, mm7
+        psubd   mm1, mm7
+        psubd   mm1, mm0
+        pxor    mm5, mm0
+        paddw   mm5, mm1
+        paddusw mm5, mm2                    # and add to weight_AB
+        psubw   mm5, mm1
+        pxor    mm5, mm0
+        cmp     rdi, rsi
+        jnz     buff_term_minus_3_loop
+
+        movq    [rbp+16], mm4               # post-store samples_AB[0]
+
+bdone:  pslld   mm5, 16                     # sign-extend 16-bit weights back to dwords
+        psrad   mm5, 16
+        movq    [rbp+8], mm5                # put weight_AB back
+
+        // convert samples_AB array back into samples_A and samples_B
+
+        movq    mm0, [rbp+16]
+        movq    mm1, [rbp+24]
+        movq    mm2, [rbp+32]
+        movq    mm3, [rbp+40]
+        movq    mm4, [rbp+48]
+        movq    mm5, [rbp+56]
+        movq    mm6, [rbp+64]
+        movq    mm7, [rbp+72]
+
+        movd    [rbp+16], mm0
+        movd    [rbp+20], mm1
+        movd    [rbp+24], mm2
+        movd    [rbp+28], mm3
+        movd    [rbp+32], mm4
+        movd    [rbp+36], mm5
+        movd    [rbp+40], mm6
+        movd    [rbp+44], mm7
+
+        punpckhdq   mm0, mm0
+        punpckhdq   mm1, mm1
+        punpckhdq   mm2, mm2
+        punpckhdq   mm3, mm3
+        punpckhdq   mm4, mm4
+        punpckhdq   mm5, mm5
+        punpckhdq   mm6, mm6
+        punpckhdq   mm7, mm7
+
+        movd    [rbp+48], mm0
+        movd    [rbp+52], mm1
+        movd    [rbp+56], mm2
+        movd    [rbp+60], mm3
+        movd    [rbp+64], mm4
+        movd    [rbp+68], mm5
+        movd    [rbp+72], mm6
+        movd    [rbp+76], mm7
+
+        emms
+
+        add     rsp, 8
+        pop     rsi
+        pop     rdi
+        pop     rbx
+        pop     rbp
+        ret
+
+
+# These are assembly optimized versions of the following WavPack functions:
+#
+# void pack_decorr_stereo_pass_cont (
+#   struct decorr_pass *dpp,
+#   int32_t *in_buffer,
+#   int32_t *out_buffer,
+#   int32_t sample_count);
+#
+# void pack_decorr_stereo_pass_cont_rev (
+#   struct decorr_pass *dpp,
+#   int32_t *in_buffer,
+#   int32_t *out_buffer,
+#   int32_t sample_count);
+#
+# It performs a single pass of stereo decorrelation, transferring from the
+# input buffer to the output buffer. Note that this version of the function
+# requires that the up to 8 previous (depending on dpp->term) stereo samples
+# are visible and correct. In other words, it ignores the "samples_*"
+# fields in the decorr_pass structure and gets the history data directly
+# from the source buffer. It does, however, return the appropriate history
+# samples to the decorr_pass structure before returning.
+#
+# This is written to work on an X86-64 processor (also called the AMD64)
+# running in 64-bit mode and uses the MMX extensions to improve the
+# performance by processing both stereo channels together. It is based on
+# the original MMX code written by Joachim Henke that used MMX intrinsics
+# called from C. Many thanks to Joachim for that!
+#
+# This version has entry points for both the System V ABI and the Windows
+# X64 ABI. It does not use the "red zone" or the "shadow area"; it saves the
+# non-volatile registers for both ABIs on the stack and allocates another
+# 8 bytes on the stack to store the dpp pointer. Note that it does NOT
+# provide unwind data for the Windows ABI (the unpack_x64.asm module for
+# MSVC does). The arguments are passed in registers:
+#
+#                             System V  Windows  
+#   struct decorr_pass *dpp     rdi       rcx
+#   int32_t *in_buffer          rsi       rdx
+#   int32_t *out_buffer         rdx       r8
+#   int32_t sample_count        ecx       r9
+#
+# During the processing loops, the following registers are used:
+#
+#   rdi         input buffer pointer
+#   rsi         direction (-8 forward, +8 reverse)
+#   rbx         delta from input to output buffer
+#   ecx         sample count
+#   rdx         sign (dir) * term * -8 (terms 1-8 only)
+#   mm0, mm1    scratch
+#   mm2         original sample values
+#   mm3         correlation samples
+#   mm4         weight sums
+#   mm5         weights
+#   mm6         delta
+#   mm7         512 (for rounding)
+#
+# stack usage:
+#
+# [rsp+0] = *dpp
+#
+
+# Entry points for pack_decorr_stereo_pass_cont() and its "_rev" variant.
+# Each entry loads the direction into rax (+8 for the reverse variant, -8 for
+# the forward variant) and jumps to the prologue for its ABI. Both prologues
+# push rbp, rbx, rdi and rsi, reserve 8 bytes of stack for the dpp pointer,
+# and then meet at "enter" with the arguments in the System V registers.
+# NOTE(review): sample_count is not tested for zero here and the processing
+# loops count down with dec/jnz, so callers presumably always pass a
+# non-zero count - confirm against the C callers.
+
+_pack_decorr_stereo_pass_cont_rev_x64win:
+pack_decorr_stereo_pass_cont_rev_x64win:
+        mov     rax, 8                      # reverse direction
+        jmp     wstart
+
+_pack_decorr_stereo_pass_cont_x64win:
+pack_decorr_stereo_pass_cont_x64win:
+        mov     rax, -8                     # forward direction
+        jmp     wstart
+
+wstart: push    rbp
+        push    rbx
+        push    rdi
+        push    rsi
+        sub     rsp, 8
+        mov     rdi, rcx                    # copy params from win regs to Linux regs
+        mov     rsi, rdx                    # so we can leave following code similar
+        mov     rdx, r8
+        mov     rcx, r9
+        jmp     enter
+
+_pack_decorr_stereo_pass_cont_rev_x64:
+pack_decorr_stereo_pass_cont_rev_x64:
+        mov     rax, 8                      # reverse direction
+        jmp     start
+
+_pack_decorr_stereo_pass_cont_x64:
+pack_decorr_stereo_pass_cont_x64:
+        mov     rax, -8                     # forward direction
+        jmp     start
+
+start:  push    rbp
+        push    rbx
+        push    rdi
+        push    rsi
+        sub     rsp, 8
+
+# Common setup: from here on rdi = input pointer, rsi = direction,
+# rdx = out_buffer and ecx = sample_count.
+
+enter:  mov     [rsp], rdi                  # [rsp] = *dpp
+        mov     rdi, rsi                    # rdi = inbuffer
+        mov     rsi, rax                    # get direction from rax
+
+        mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  # mm7 = round (512)
+
+        mov     rax, [rsp]                  # access dpp
+        mov     eax, [rax+4]                # eax = delta field of dpp
+        movd    mm6, eax
+        punpckldq mm6, mm6                  # mm6 = delta (0-7)
+
+        mov     rax, [rsp]                  # access dpp
+        movq    mm5, [rax+8]                # mm5 = weight_AB
+        movq    mm4, [rax+88]               # mm4 = sum_AB
+
+        mov     rbx, rdx                    # rbx = out_buffer (rdx) - in_buffer (rdi)
+        sub     rbx, rdi
+
+# Dispatch on the low byte of the term. Terms 17, 18, -1, -2 and -3 have
+# dedicated loops; every other term falls through to the default loop.
+
+        mov     rax, [rsp]                  # rax = dpp
+        movsxd  rax, DWORD PTR [rax]        # get term and vector to correct loop
+        cmp     al, 17
+        je      term_17_loop
+        cmp     al, 18
+        je      term_18_loop
+        cmp     al, -1
+        je      term_minus_1_loop
+        cmp     al, -2
+        je      term_minus_2_loop
+        cmp     al, -3
+        je      term_minus_3_loop
+
+        shl     rax, 3                      # 8 bytes per stereo sample pair
+        mov     rdx, rax                    # rdx = term * 8 to index correlation sample
+        test    rsi, rsi                    # test direction
+        jns     default_term_loop           # reverse (rsi = +8): keep positive offset
+        neg     rdx                         # forward (rsi = -8): use negative offset
+        jmp     default_term_loop
+
+        .balign  64
+
+# Loop for terms that were not dispatched to a dedicated loop (terms 1-8
+# according to the register notes for rdx). The correlation pair is read
+# from the input buffer at the byte offset held in rdx. Each pass handles
+# one stereo pair (two dwords) at once:
+#   - sam_AB is split into its low 15 bits and bits 15-30; each part is
+#     multiplied by the weights with pmaddwd and the products are combined
+#     as (high << 5) + ((low + 512) >> 10)
+#   - that value is subtracted from left_right and stored at rdi+rbx
+#   - delta is added to the weights when sam_AB and the result have the
+#     same sign and subtracted when the signs differ (the xor/add/xor
+#     sequence negates around the add); no change if either value is zero
+#   - the updated weights are accumulated into the running sum in mm4
+
+default_term_loop:
+        movq    mm3, [rdi+rdx]              # mm3 = sam_AB
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17                     # mm1 = low 15 bits of sam_AB
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16                     # mm0 = bits 15-30 of sam_AB
+        pmaddwd mm0, mm5
+
+        movq    mm2, [rdi]                  # mm2 = left_right
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # subtract both shifted products
+        movq    mm0, mm3
+        movq    [rdi+rbx], mm2              # store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ result)
+        sub     rdi, rsi                    # step input pointer to the next pair
+        pxor    mm1, mm1                    # mm1 = zero
+        pcmpeqd mm2, mm1                    # mm2 = 1s if result was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pxor    mm5, mm0
+        paddd   mm5, mm2                    # and add to weight_AB
+        pxor    mm5, mm0
+        paddd   mm4, mm5                    # add weights to sum
+        dec     ecx
+        jnz     default_term_loop
+
+        mov     rax, [rsp]                  # access dpp
+        movq    [rax+8], mm5                # put weight_AB back
+        movq    [rax+88], mm4               # put sum_AB back
+        emms
+
+        mov     rdx, [rsp]                  # access dpp with rdx
+        movsxd  rcx, DWORD PTR [rdx]        # rcx = dpp->term
+
+# Walk back over the last "term" input pairs and copy them into the
+# samples_A / samples_B history arrays, filling index term-1 down to 0.
+
+default_store_samples:
+        dec     rcx
+        add     rdi, rsi                    # back up one full sample
+        mov     eax, [rdi+4]
+        mov     [rdx+rcx*4+48], eax         # store samples_B [ecx]
+        mov     eax, [rdi]
+        mov     [rdx+rcx*4+16], eax         # store samples_A [ecx]
+        test    rcx, rcx
+        jnz     default_store_samples
+        jmp     done
+
+        .balign  64
+
+# Loop for term 17. The correlation value is computed from the two input
+# pairs preceding the current one (at rdi+rsi and rdi+rsi*2):
+#   sam_AB = 2 * previous - the one before it
+# The multiply, rounding, store and weight update that follow use the same
+# instruction sequence as the other loops of this function.
+
+term_17_loop:
+        movq    mm3, [rdi+rsi]              # get previous calculated value
+        paddd   mm3, mm3                    # double it
+        psubd   mm3, [rdi+rsi*2]            # mm3 = sam_AB
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17                     # mm1 = low 15 bits of sam_AB
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16                     # mm0 = bits 15-30 of sam_AB
+        pmaddwd mm0, mm5
+
+        movq    mm2, [rdi]                  # mm2 = left_right
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # subtract both shifted products
+        movq    mm0, mm3
+        movq    [rdi+rbx], mm2              # store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ result)
+        sub     rdi, rsi                    # step input pointer to the next pair
+        pxor    mm1, mm1                    # mm1 = zero
+        pcmpeqd mm2, mm1                    # mm2 = 1s if result was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pxor    mm5, mm0
+        paddd   mm5, mm2                    # and add to weight_AB
+        pxor    mm5, mm0
+        paddd   mm4, mm5                    # add weights to sum
+        dec     ecx
+        jnz     term_17_loop
+
+        mov     rax, [rsp]                  # access dpp
+        movq    [rax+8], mm5                # put weight_AB back
+        movq    [rax+88], mm4               # put sum_AB back
+        emms
+        jmp     term_1718_common_store      # history store is shared with term 18
+
+        .balign  64
+
+# Loop for term 18. The correlation value is computed from the two input
+# pairs preceding the current one (at rdi+rsi and rdi+rsi*2):
+#   sam_AB = previous + ((previous - the one before it) >> 1)
+# The multiply, rounding, store and weight update that follow use the same
+# instruction sequence as the other loops of this function.
+
+term_18_loop:
+        movq    mm3, [rdi+rsi]              # get previous calculated value
+        movq    mm0, mm3
+        psubd   mm3, [rdi+rsi*2]
+        psrad   mm3, 1
+        paddd   mm3, mm0                    # mm3 = sam_AB
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17                     # mm1 = low 15 bits of sam_AB
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16                     # mm0 = bits 15-30 of sam_AB
+        pmaddwd mm0, mm5
+
+        movq    mm2, [rdi]                  # mm2 = left_right
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # subtract both shifted products
+        movq    mm0, mm3
+        movq    [rdi+rbx], mm2              # store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ result)
+        sub     rdi, rsi                    # step input pointer to the next pair
+        pxor    mm1, mm1                    # mm1 = zero
+        pcmpeqd mm2, mm1                    # mm2 = 1s if result was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pxor    mm5, mm0
+        paddd   mm5, mm2                    # and add to weight_AB
+        pxor    mm5, mm0
+        dec     ecx
+        paddd   mm4, mm5                    # add weights to sum (MMX op keeps flags from dec)
+        jnz     term_18_loop
+
+        mov     rax, [rsp]                  # access dpp
+        movq    [rax+8], mm5                # put weight_AB back
+        movq    [rax+88], mm4               # put sum_AB back
+        emms
+
+# Shared by the term 17 and term 18 loops: copy the last two input pairs
+# into entries 0 and 1 of the samples_A / samples_B history arrays.
+
+term_1718_common_store:
+
+        mov     rax, [rsp]                  # access dpp
+        add     rdi, rsi                    # back up a full sample
+        mov     edx, [rdi+4]                # dpp->samples_B [0] = iptr [-1];
+        mov     [rax+48], edx
+        mov     edx, [rdi]                  # dpp->samples_A [0] = iptr [-2];
+        mov     [rax+16], edx
+        add     rdi, rsi                    # back up another sample
+        mov     edx, [rdi+4]                # dpp->samples_B [1] = iptr [-3];
+        mov     [rax+52], edx
+        mov     edx, [rdi]                  # dpp->samples_A [1] = iptr [-4];
+        mov     [rax+20], edx
+        jmp     done
+
+        .balign  64
+
+# Loop for term -1. sam_AB is assembled from two different pairs: its low
+# dword is the high dword of the previous input pair (at rdi+rsi) and its
+# high dword is the low dword of the current input pair.
+#
+# The weight update differs from the non-negative terms: a per-lane bias of
+# -1025 (or -1024 when the sign mask is set) is built in mm1 and added before
+# the delta so that the unsigned-saturating word add (paddusw) stops at
+# 0xffff; removing the bias afterwards leaves the weight limited to +/-1024.
+
+term_minus_1_loop:
+        movq    mm3, [rdi+rsi]              # mm3 = previous calculated value
+        movq    mm2, [rdi]                  # mm2 = left_right
+        psrlq   mm3, 32                     # move previous high dword to low
+        punpckldq mm3, mm2                  # mm3 = sam_AB
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17                     # mm1 = low 15 bits of sam_AB
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16                     # mm0 = bits 15-30 of sam_AB
+        pmaddwd mm0, mm5
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # subtract both shifted products
+        movq    mm0, mm3
+        movq    [rdi+rbx], mm2              # store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ result)
+        sub     rdi, rsi                    # step input pointer to the next pair
+        pxor    mm1, mm1                    # mm1 = zero
+        pcmpeqd mm2, mm1                    # mm2 = 1s if result was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pcmpeqd mm1, mm1                    # mm1 = -1 in each dword
+        psubd   mm1, mm7
+        psubd   mm1, mm7                    # mm1 = -1025
+        psubd   mm1, mm0                    # mm1 = -1024 where sign mask is set
+        pxor    mm5, mm0
+        paddd   mm5, mm1                    # apply bias
+        paddusw mm5, mm2                    # and add to weight_AB
+        psubd   mm5, mm1                    # remove bias
+        pxor    mm5, mm0
+        paddd   mm4, mm5                    # add weights to sum
+        dec     ecx
+        jnz     term_minus_1_loop
+
+        mov     rax, [rsp]                  # access dpp
+        movq    [rax+8], mm5                # put weight_AB back
+        movq    [rax+88], mm4               # put sum_AB back
+        emms
+
+        add     rdi, rsi                    # back up a full sample
+        mov     edx, [rdi+4]                # dpp->samples_A [0] = iptr [-1];
+        mov     rax, [rsp]
+        mov     [rax+16], edx
+        jmp     done
+
+        .balign  64
+
+# Loop for term -2. sam_AB is assembled from two different pairs: its low
+# dword is the high dword of the current input pair and its high dword is
+# the low dword of the previous input pair (at rdi+rsi).
+#
+# The weight update differs from the non-negative terms: a per-lane bias of
+# -1025 (or -1024 when the sign mask is set) is built in mm1 and added before
+# the delta so that the unsigned-saturating word add (paddusw) stops at
+# 0xffff; removing the bias afterwards leaves the weight limited to +/-1024.
+
+term_minus_2_loop:
+        movq    mm2, [rdi]                  # mm2 = left_right
+        movq    mm3, mm2                    # mm3 = swap dwords
+        psrlq   mm3, 32                     # move current high dword to low
+        punpckldq mm3, [rdi+rsi]            # mm3 = sam_AB
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17                     # mm1 = low 15 bits of sam_AB
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16                     # mm0 = bits 15-30 of sam_AB
+        pmaddwd mm0, mm5
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # subtract both shifted products
+        movq    mm0, mm3
+        movq    [rdi+rbx], mm2              # store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ result)
+        sub     rdi, rsi                    # step input pointer to the next pair
+        pxor    mm1, mm1                    # mm1 = zero
+        pcmpeqd mm2, mm1                    # mm2 = 1s if result was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pcmpeqd mm1, mm1                    # mm1 = -1 in each dword
+        psubd   mm1, mm7
+        psubd   mm1, mm7                    # mm1 = -1025
+        psubd   mm1, mm0                    # mm1 = -1024 where sign mask is set
+        pxor    mm5, mm0
+        paddd   mm5, mm1                    # apply bias
+        paddusw mm5, mm2                    # and add to weight_AB
+        psubd   mm5, mm1                    # remove bias
+        pxor    mm5, mm0
+        paddd   mm4, mm5                    # add weights to sum
+        dec     ecx
+        jnz     term_minus_2_loop
+
+        mov     rax, [rsp]                  # access dpp
+        movq    [rax+8], mm5                # put weight_AB back
+        movq    [rax+88], mm4               # put sum_AB back
+        emms
+
+        add     rdi, rsi                    # back up a full sample
+        mov     edx, [rdi]                  # dpp->samples_B [0] = iptr [-2];
+        mov     rax, [rsp]
+        mov     [rax+48], edx
+        jmp     done
+
+        .balign  64
+
+# Loop for term -3. sam_AB is the previous input pair (at rdi+rsi) with its
+# two dwords swapped.
+#
+# The weight update differs from the non-negative terms: a per-lane bias of
+# -1025 (or -1024 when the sign mask is set) is built in mm1 and added before
+# the delta so that the unsigned-saturating word add (paddusw) stops at
+# 0xffff; removing the bias afterwards leaves the weight limited to +/-1024.
+
+term_minus_3_loop:
+        movq    mm0, [rdi+rsi]              # mm0 = previous calculated value
+        movq    mm3, mm0                    # mm3 = swap dwords
+        psrlq   mm3, 32
+        punpckldq mm3, mm0                  # mm3 = sam_AB
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17                     # mm1 = low 15 bits of sam_AB
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16                     # mm0 = bits 15-30 of sam_AB
+        pmaddwd mm0, mm5
+
+        movq    mm2, [rdi]                  # mm2 = left_right
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # subtract both shifted products
+        movq    mm0, mm3
+        movq    [rdi+rbx], mm2              # store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ result)
+        sub     rdi, rsi                    # step input pointer to the next pair
+        pxor    mm1, mm1                    # mm1 = zero
+        pcmpeqd mm2, mm1                    # mm2 = 1s if result was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pcmpeqd mm1, mm1                    # mm1 = -1 in each dword
+        psubd   mm1, mm7
+        psubd   mm1, mm7                    # mm1 = -1025
+        psubd   mm1, mm0                    # mm1 = -1024 where sign mask is set
+        pxor    mm5, mm0
+        paddd   mm5, mm1                    # apply bias
+        paddusw mm5, mm2                    # and add to weight_AB
+        psubd   mm5, mm1                    # remove bias
+        pxor    mm5, mm0
+        paddd   mm4, mm5                    # add weights to sum
+        dec     ecx
+        jnz     term_minus_3_loop
+
+        mov     rax, [rsp]                  # access dpp
+        movq    [rax+8], mm5                # put weight_AB back
+        movq    [rax+88], mm4               # put sum_AB back
+        emms
+
+        add     rdi, rsi                    # back up a full sample
+        mov     edx, [rdi+4]                # dpp->samples_A [0] = iptr [-1];
+        mov     rax, [rsp]
+        mov     [rax+16], edx
+        mov     edx, [rdi]                  # dpp->samples_B [0] = iptr [-2];
+        mov     [rax+48], edx
+
+# Common exit for every loop of this function: release the dpp slot and
+# restore the registers pushed by the prologue, in reverse order.
+
+done:   add     rsp, 8
+        pop     rsi
+        pop     rdi
+        pop     rbx
+        pop     rbp
+        ret
+
+
+# This is an assembly optimized version of the following WavPack function:
+#
+# uint32_t decorr_mono_buffer (int32_t *buffer,
+#                              struct decorr_pass *decorr_passes,
+#                              int32_t num_terms,
+#                              int32_t sample_count)
+#
+# Decorrelate a buffer of mono samples, in place, as specified by the array
+# of decorr_pass structures. Note that this function does NOT return the
+# dpp->samples_X[] values in the "normalized" positions for terms 1-8, so if
+# the number of samples is not a multiple of MAX_TERM, these must be moved if
+# they are to be used somewhere else. The magnitude of the output samples is
+# accumulated and returned (see scan_max_magnitude() for more details). By
+# using the overflow detection of the multiply instruction, this detects
+# when the "long_math" variant is required.
+#
+# For the fastest possible operation with the four "common" decorrelation
+# filters (i.e, fast, normal, high and very high) this function can be
+# configured to include hardcoded versions of these filters that are created
+# using macros. In that case, the passed filter is checked to make sure that
+# it matches one of the four. If it doesn't, or if the hardcoded filters are
+# not enabled, a "general" version of the decorrelation loop is used. This
+# variable enables the hardcoded filters and can be disabled if there are
+# problems with the code or macros:
+
+        HARDCODED_FILTERS = 1
+
+# Entry points for both the System V ABI and the Windows X64 ABI are provided.
+# It does not use the "red zone" or the "shadow area"; it saves the
+# non-volatile registers for both ABIs on the stack and allocates another
+# 24 bytes on the stack to store the dpp pointer and the sample count. Note
+# that it does NOT provide unwind data for the Windows ABI (the unpack_x64.asm
+# module for MSVC does). The arguments are passed in registers:
+#
+#                             System V  Windows  
+#   int32_t *buffer             rdi       rcx
+#   struct decorr_pass *dpp     rsi       rdx
+#   int32_t num_terms           rdx       r8
+#   int32_t sample_count        ecx       r9
+#
+# stack usage:
+#
+# [rsp+8] = sample_count
+# [rsp+0] = decorr_passes (unused in hardcoded filter case)
+#
+# register usage:
+#
+# ecx = sample being decorrelated
+# esi = sample up counter
+# rdi = *buffer
+# rbp = *dpp
+# r8 = magnitude accumulator
+# r9 = dpp end ptr (unused in hardcoded filter case)
+#
+        .if     HARDCODED_FILTERS
+#
+# This macro is used for checking the decorr_passes array to make sure that the terms match
+# the hardcoded terms. The terms of these filters are the first element in the tables defined
+# in decorr_tables.h (with the negative terms replaced with 1).
+#
+# Arguments:
+#   term        expected term value, compared against the byte at [rbp]
+#   rbp_offset  amount added to rbp after a successful match
+#
+# On a mismatch control leaves the macro for use_general_version and rbp is
+# not advanced.
+#
+
+        .macro  chkterm term rbp_offset
+        cmp     BYTE PTR [rbp], \term
+        jnz     use_general_version
+        add     rbp, \rbp_offset
+        .endm
+#
+# This macro processes the single specified term (with a fixed delta of 2) and updates the
+# term pointer (rbp) with the specified offset when done. It assumes the following registers:
+#
+# ecx = sample being decorrelated
+# esi = sample up counter (used for terms 1-8)
+# rbp = decorr_pass pointer for this term (updated with "rbp_offset" when done)
+# rax, rbx, rdx = scratch
+#
+# Steps performed:
+#   - the correlation sample is formed in ebx from the history stored at
+#     rbp+16 and the incoming sample (ecx) is written into that history
+#   - ebx is multiplied by the weight at rbp+8, the product is shifted right
+#     by 10 with rounding and subtracted from ecx
+#   - unless the new ecx or ebx is zero, the weight is raised by 2 when ebx
+#     and the new ecx have the same sign and lowered by 2 otherwise
+#
+        .macro  exeterm term rbp_offset
+
+        .if     \term <= 8
+        mov     eax, esi
+        and     eax, 7                      # slot = up counter modulo 8
+        mov     ebx, [rbp+16+rax*4]         # ebx = history sample in that slot
+        .if     \term != 8
+        add     eax, \term
+        and     eax, 7                      # slot "term" entries further on (wraps at 8)
+        .endif
+        mov     [rbp+16+rax*4], ecx         # save the incoming sample
+
+        .elseif     \term == 17
+
+        mov     edx, [rbp+16]               # handle term 17
+        mov     [rbp+16], ecx
+        lea     ebx, [rdx+rdx]
+        sub     ebx, [rbp+20]               # ebx = 2 * newest - older history sample
+        mov     [rbp+20], edx
+
+        .else
+
+        mov     edx, [rbp+16]               # handle term 18
+        mov     [rbp+16], ecx
+        lea     ebx, [rdx+rdx*2]
+        sub     ebx, [rbp+20]
+        sar     ebx, 1                      # ebx = (3 * newest - older) >> 1
+        mov     [rbp+20], edx
+
+        .endif
+
+        mov     eax, [rbp+8]                # eax = weight
+        imul    eax, ebx                    # 32-bit multiply is almost always enough
+        jo      1f                          # but handle overflow if it happens
+        sar     eax, 10                     # carry = last bit shifted out
+        sbb     ecx, eax                    # borrow flag provides rounding
+        jmp     2f
+1:      mov     eax, [rbp+8]                # perform 64-bit multiply on overflow
+        imul    ebx                         # edx:eax = weight * ebx
+        shr     eax, 10                     # low half of the shifted product
+        sbb     ecx, eax                    # subtract it, carry again rounds
+        shl     edx, 22                     # high half moved to bits 22 and up
+        sub     ecx, edx
+2:      je      3f                          # new sample is zero: leave weight alone
+        test    ebx, ebx
+        je      3f                          # correlation sample is zero: leave weight alone
+        xor     ebx, ecx                    # sign bit set when the signs differ
+        sar     ebx, 30
+        or      ebx, 1                      # this generates delta of 1
+        shl     ebx, 1                      # this generates delta of 2
+        add     [rbp+8], ebx                # weight += 2 or -2
+3:      add     rbp, \rbp_offset
+
+        .endm
+
+        .endif                              # end of macro definitions
+
+# entry points of function
+
+_pack_decorr_mono_buffer_x64win:            # Windows x64 entry (underscore-prefixed alias)
+pack_decorr_mono_buffer_x64win:
+        push    rbp                         # save the registers this routine modifies
+        push    rbx
+        push    rdi
+        push    rsi
+        sub     rsp, 24                     # stack locals; [rsp+0] and [rsp+8] are used below
+        mov     rdi, rcx                    # copy params from win regs to Linux regs
+        mov     rsi, rdx                    # so we can leave following code similar
+        mov     rdx, r8
+        mov     rcx, r9
+        jmp     mentry                      # continue in the shared body
+
+_pack_decorr_mono_buffer_x64:               # entry using rdi/rsi/rdx/rcx directly (underscore-prefixed alias)
+pack_decorr_mono_buffer_x64:
+        push    rbp                         # same prologue as the Windows entry
+        push    rbx
+        push    rdi
+        push    rsi
+        sub     rsp, 24
+
+mentry: mov     [rsp+8], rcx                # [rsp+8] = sample count
+        mov     [rsp], rsi                  # [rsp+0] = decorr_passes
+        xor     r8, r8                      # r8 = max magnitude mask
+        xor     esi, esi                    # up counter = 0
+
+        and     ecx, ecx                    # test & handle zero sample count & zero term count
+        jz      mexit                       # no samples: return a zero mask
+        and     edx, edx                    # edx = term count (the 32-bit write also clears the top of rdx)
+        jz      mexit                       # no terms: return a zero mask
+
+        .if     HARDCODED_FILTERS
+
+# first check to make sure all the "deltas" are 2
+
+        mov     rbp, [rsp]                  # rbp is decorr_pass pointer
+        mov     ebx, edx                    # get term count
+deltas: cmp     BYTE PTR [rbp+4], 2         # make sure all the deltas are 2
+        jnz     use_general_version         # if any aren't, use general case
+        add     rbp, 96                     # step to the next decorr_pass (96-byte stride)
+        dec     ebx                         # one fewer pass left to check
+        jnz     deltas
+
+        mov     rbp, [rsp]                  # rbp is decorr_pass pointer
+        cmp     dl, 2                       # 2 terms is "fast"
+        jnz     nfast                       # not 2 terms: try the next preset
+        chkterm 18,  96                     # check "fast" terms
+        chkterm 17, -96                     # offsets sum to 0 (chkterm is defined outside this section; presumably it advances rbp)
+        jmp     mono_fast_loop
+
+nfast:  cmp     dl, 5                       # 5 terms is "normal"
+        jnz     nnorm                       # not 5 terms: try the next preset
+        chkterm 18, 96                      # check "normal" terms
+        chkterm 18, 96
+        chkterm 2,  96
+        chkterm 17, 96
+        chkterm 3,  96*-4                   # offsets sum to 0 across the five checks
+        jmp     mono_normal_loop
+
+nnorm:  cmp     dl, 10                      # 10 terms is "high"
+        jnz     nhigh                       # not 10 terms: try the next preset
+        chkterm 18, 96                      # check "high" terms
+        chkterm 18, 96
+        chkterm 18, 96
+        chkterm 1,  96
+        chkterm 2,  96
+        chkterm 3,  96
+        chkterm 5,  96
+        chkterm 1,  96
+        chkterm 17, 96
+        chkterm 4,  96*-9                   # offsets sum to 0 across the ten checks
+        jmp     mono_high_loop
+
+nhigh:  cmp     dl, 16                      # 16 terms is "very high"
+        jnz     use_general_version         # if none of these, use general version
+        chkterm 18, 96                      # else check "very high" terms
+        chkterm 18, 96
+        chkterm 2,  96
+        chkterm 3,  96
+        chkterm 1,  96
+        chkterm 18, 96
+        chkterm 2,  96
+        chkterm 4,  96
+        chkterm 7,  96
+        chkterm 5,  96
+        chkterm 3,  96
+        chkterm 6,  96
+        chkterm 8,  96
+        chkterm 1,  96
+        chkterm 18, 96
+        chkterm 2,  96*-15                  # offsets sum to 0 across the sixteen checks
+        jmp     mono_vhigh_loop
+
+        .balign  64
+
+# hardcoded "fast" decorrelation loop (terms 18, 17; every delta already verified to be 2)
+
+mono_fast_loop:
+        mov     ecx, [rdi+rsi*4]             # ecx is the sample we're decorrelating
+
+        exeterm 18,  96                     # apply term 18, then move rbp to the second pass
+        exeterm 17, -96                     # apply term 17, then move rbp back to the first pass
+
+        mov     [rdi+rsi*4], ecx            # store completed sample
+        mov     eax, ecx                    # update magnitude mask
+        cdq                                 # edx = 0 or -1 according to the sign of eax
+        xor     eax, edx                    # complement negative values; the write clears the top of rax
+        or      r8, rax                     # fold into the running mask
+        inc     esi                         # increment sample index
+        cmp     esi, [rsp+8]                # compare with the saved sample count
+        jnz     mono_fast_loop              # loop back for all samples
+        jmp     mexit                       # then exit
+
+        .balign  64
+
+# hardcoded "normal" decorrelation loop (terms 18, 18, 2, 17, 3; every delta already verified to be 2)
+
+mono_normal_loop:
+        mov     ecx, [rdi+rsi*4]             # ecx is the sample we're decorrelating
+
+        exeterm 18, 96                      # each expansion applies one term and moves rbp by its offset
+        exeterm 18, 96
+        exeterm 2,  96
+        exeterm 17, 96
+        exeterm 3,  96*-4                   # last offset moves rbp back to the first pass
+
+        mov     [rdi+rsi*4], ecx            # store completed sample
+        mov     eax, ecx                    # update magnitude mask
+        cdq                                 # edx = 0 or -1 according to the sign of eax
+        xor     eax, edx                    # complement negative values; the write clears the top of rax
+        or      r8, rax                     # fold into the running mask
+        inc     esi                         # increment sample index
+        cmp     esi, [rsp+8]                # compare with the saved sample count
+        jnz     mono_normal_loop            # loop back for all samples
+        jmp     mexit                       # then exit
+
+        .balign  64
+
+# hardcoded "high" decorrelation loop (ten terms; every delta already verified to be 2)
+
+mono_high_loop:
+        mov     ecx, [rdi+rsi*4]             # ecx is the sample we're decorrelating
+
+        exeterm 18, 96                      # each expansion applies one term and moves rbp by its offset
+        exeterm 18, 96
+        exeterm 18, 96
+        exeterm 1,  96
+        exeterm 2,  96
+        exeterm 3,  96
+        exeterm 5,  96
+        exeterm 1,  96
+        exeterm 17, 96
+        exeterm 4,  96*-9                   # last offset moves rbp back to the first pass
+
+        mov     [rdi+rsi*4], ecx            # store completed sample
+        mov     eax, ecx                    # update magnitude mask
+        cdq                                 # edx = 0 or -1 according to the sign of eax
+        xor     eax, edx                    # complement negative values; the write clears the top of rax
+        or      r8, rax                     # fold into the running mask
+        inc     esi                         # increment sample index
+        cmp     esi, [rsp+8]                # compare with the saved sample count
+        jnz     mono_high_loop              # loop back for all samples
+        jmp     mexit                       # then exit
+
+        .balign  64
+
+# hardcoded "very high" decorrelation loop (sixteen terms; every delta already verified to be 2)
+
+mono_vhigh_loop:
+        mov     ecx, [rdi+rsi*4]             # ecx is the sample we're decorrelating
+
+        exeterm 18, 96                      # each expansion applies one term and moves rbp by its offset
+        exeterm 18, 96
+        exeterm 2,  96
+        exeterm 3,  96
+        exeterm 1,  96
+        exeterm 18, 96
+        exeterm 2,  96
+        exeterm 4,  96
+        exeterm 7,  96
+        exeterm 5,  96
+        exeterm 3,  96
+        exeterm 6,  96
+        exeterm 8,  96
+        exeterm 1,  96
+        exeterm 18, 96
+        exeterm 2,  96*-15                  # last offset moves rbp back to the first pass
+
+        mov     [rdi+rsi*4], ecx            # store completed sample
+        mov     eax, ecx                    # update magnitude mask
+        cdq                                 # edx = 0 or -1 according to the sign of eax
+        xor     eax, edx                    # complement negative values; the write clears the top of rax
+        or      r8, rax                     # fold into the running mask
+        inc     esi                         # increment sample index
+        cmp     esi, [rsp+8]                # compare with the saved sample count
+        jnz     mono_vhigh_loop             # loop back for all samples
+        jmp     mexit                       # then exit
+
+        .endif                              # end of hardcoded filters configuration
+
+# when none of the hardcoded filters are applicable, or we aren't using them, fall through to here
+
+use_general_version:
+        mov     rbp, [rsp]                   # reload decorr_passes pointer to first term
+        imul    rax, rdx, 96                 # rax = 96 * rdx (term count per the setup code; 96-byte stride)
+        add     rax, rbp                     # r9 = terminating decorr_pass pointer
+        mov     r9, rax
+        jmp     decorrelate_loop             # enter the aligned per-sample loop
+
+        .balign  64
+
+decorrelate_loop:
+        mov     ecx, [rdi+rsi*4]             # ecx is the sample we're decorrelating
+nxterm: mov     edx, [rbp]                  # edx = term of the current pass
+        cmp     dl, 17
+        jge     3f                          # 17 and above take the two-sample history path
+
+        mov     eax, esi                    # lower terms: start from the up counter
+        and     eax, 7                      # eax = read slot in the 8-entry history
+        mov     ebx, [rbp+16+rax*4]         # ebx = history sample used as the predictor
+        add     eax, edx                    # write slot = read slot + term ...
+        and     eax, 7                      # ... mod 8
+        mov     [rbp+16+rax*4], ecx         # save the incoming sample into the history
+        jmp     domult
+
+        .balign  4
+3:      mov     edx, [rbp+16]               # edx = previous sample
+        mov     [rbp+16], ecx               # newest history entry = incoming sample
+        je      4f                          # flags still from cmp dl, 17: equal means term 17
+        lea     ebx, [rdx+rdx*2]            # other case: ebx = 3 * previous ...
+        sub     ebx, [rbp+20]               # ... minus the sample before that ...
+        sar     ebx, 1                      # ... halved with an arithmetic shift
+        mov     [rbp+20], edx               # previous sample becomes the older history entry
+        jmp     domult
+
+        .balign  4
+4:      lea     ebx, [rdx+rdx]              # term 17: ebx = 2 * previous ...
+        sub     ebx, [rbp+20]               # ... minus the sample before that
+        mov     [rbp+20], edx               # previous sample becomes the older history entry
+
+domult: mov     eax, [rbp+8]                # eax = weight of this pass
+        mov     edx, eax                    # keep a copy of the weight for the update below
+        imul    eax, ebx                    # 32-bit product of weight and predictor
+        jo      multov                      # on overflow, jump to use 64-bit imul variant
+        sar     eax, 10                     # scale the product down; CF = last bit shifted out
+        sbb     ecx, eax                    # subtract prediction; the borrow provides rounding
+        je      2f                          # result is zero: leave the weight unchanged
+        test    ebx, ebx
+        je      2f                          # predictor is zero: leave the weight unchanged
+        xor     ebx, ecx                    # sign bit set when predictor and result signs differ
+        sar     ebx, 31                     # ebx = 0 (signs match) or -1 (signs differ)
+        xor     edx, ebx                    # conditional complement so the add acts as +delta or -delta
+        add     edx, [rbp+4]                # add this pass's delta
+        xor     edx, ebx                    # undo the complement
+        mov     [rbp+8], edx                # store the updated weight
+2:      add     rbp, 96                     # step to the next decorr_pass
+        cmp     rbp, r9                     # reached the terminating pointer?
+        jnz     nxterm                      # no: apply the next term to this sample
+
+        mov     [rdi+rsi*4], ecx            # store completed sample
+        mov     eax, ecx                    # update magnitude mask
+        cdq                                 # edx = 0 or -1 according to the sign of eax
+        xor     eax, edx                    # complement negative values; the write clears the top of rax
+        or      r8, rax                     # fold into the running mask
+        mov     rbp, [rsp]                  # reload decorr_passes pointer to first term
+        inc     esi                         # increment sample index
+        cmp     esi, [rsp+8]                # compare with the saved sample count
+        jnz     decorrelate_loop            # more samples to process
+        jmp     mexit
+
+        .balign  4
+multov: mov     eax, [rbp+8]                # overflow path: reload the weight
+        imul    ebx                         # edx:eax = weight * predictor
+        shr     eax, 10                     # low part of (product >> 10); CF = rounding bit
+        sbb     ecx, eax                    # subtract the low part plus the rounding bit
+        shl     edx, 22                     # high part of (product >> 10)
+        sub     ecx, edx                    # subtract the high part
+        je      2f                          # result is zero: leave the weight unchanged
+        test    ebx, ebx
+        je      2f                          # predictor is zero: leave the weight unchanged
+        xor     ebx, ecx                    # sign bit set when predictor and result signs differ
+        sar     ebx, 31                     # ebx = 0 (signs match) or -1 (signs differ)
+        mov     eax, [rbp+8]                # reload the weight (edx was consumed by the multiply)
+        xor     eax, ebx                    # conditional complement so the add acts as +delta or -delta
+        add     eax, [rbp+4]                # add this pass's delta
+        xor     eax, ebx                    # undo the complement
+        mov     [rbp+8], eax                # store the updated weight
+2:      add     rbp, 96                     # step to the next decorr_pass
+        cmp     rbp, r9                     # reached the terminating pointer?
+        jnz     nxterm                      # no: apply the next term to this sample
+
+        mov     [rdi+rsi*4], ecx            # store completed sample
+        mov     eax, ecx                    # update magnitude mask
+        cdq                                 # edx = 0 or -1 according to the sign of eax
+        xor     eax, edx                    # complement negative values; the write clears the top of rax
+        or      r8, rax                     # fold into the running mask
+        mov     rbp, [rsp]                  # reload decorr_passes pointer to first term
+        inc     esi                         # increment sample index
+        cmp     esi, [rsp+8]                # compare with the saved sample count
+        jnz     decorrelate_loop            # loop all the way back
+
+# common exit for entire function
+
+mexit:  mov     rax, r8                     # return max magnitude
+        add     rsp, 24                     # release the 24 bytes reserved at entry
+        pop     rsi                         # restore registers in reverse push order
+        pop     rdi
+        pop     rbx
+        pop     rbp
+        ret
+
+
+# This is an assembly optimized version of the following WavPack function:
+#
+# void decorr_mono_pass_cont (int32_t *out_buffer,
+#                             int32_t *in_buffer,
+#                             struct decorr_pass *dpp,
+#                             int32_t sample_count);
+#
+# It performs a single pass of mono decorrelation, transferring from the
+# input buffer to the output buffer. Note that this version of the function
+# requires that the up to 8 previous (depending on dpp->term) mono samples
+# are visible and correct. In other words, it ignores the "samples_*"
+# fields in the decorr_pass structure and gets the history data directly
+# from the source buffer. It does, however, return the appropriate history
+# samples to the decorr_pass structure before returning.
+#
+# By using the overflow detection of the multiply instruction, it detects
+# when the "long_math" variant is required and automatically does it.
+#
+# This version has entry points for both the System V ABI and the Windows
+# X64 ABI. It does not use the "red zone" or the "shadow area"; it saves the
+# non-volatile registers for both ABIs on the stack and allocates another
+# 8 bytes on the stack to store the dpp pointer. Note that it does NOT
+# provide unwind data for the Windows ABI (the pack_x64.asm module for
+# MSVC does). The arguments are passed in registers:
+#
+#                             System V  Windows  
+#   int32_t *out_buffer         rdi       rcx
+#   int32_t *in_buffer          rsi       rdx
+#   struct decorr_pass *dpp     rdx       r8
+#   int32_t sample_count        ecx       r9
+#
+# Stack usage:
+#
+# [rsp+0] = *dpp
+#
+# Register usage:
+#
+# rsi = source ptr
+# rdi = destination ptr
+# rcx = term * -4 (default terms)
+# rcx = previous sample (terms 17 & 18)
+# ebp = weight
+# r8d = delta
+# r9d = weight sum
+# r10 = eptr
+#
+
+_pack_decorr_mono_pass_cont_x64win:         # Windows x64 entry (underscore-prefixed alias)
+pack_decorr_mono_pass_cont_x64win:
+        push    rbp                         # save the registers this routine modifies
+        push    rbx
+        push    rdi
+        push    rsi
+        sub     rsp, 8                      # one stack slot, used for the dpp pointer
+        mov     rdi, rcx                    # copy params from win regs to Linux regs
+        mov     rsi, rdx                    # so we can leave following code similar
+        mov     rdx, r8
+        mov     rcx, r9
+        jmp     menter                      # continue in the shared body
+
+_pack_decorr_mono_pass_cont_x64:            # System V entry (underscore-prefixed alias)
+pack_decorr_mono_pass_cont_x64:
+        push    rbp                         # same prologue as the Windows entry
+        push    rbx
+        push    rdi
+        push    rsi
+        sub     rsp, 8
+
+menter: mov     [rsp], rdx                  # [rsp+0] = dpp, reloaded when the loops finish
+        and     ecx, ecx                    # test & handle zero sample count
+        jz      mono_done
+
+        cld                                 # lodsd/stosd below must step forward
+        mov     r8d, [rdx+4]                # r8d = delta
+        mov     ebp, [rdx+8]                # ebp = weight
+        mov     r9d, [rdx+88]               # r9d = weight sum
+        lea     r10, [rsi+rcx*4]            # r10 = eptr
+        mov     ecx, [rsi-4]                # preload last sample
+        mov     eax, [rdx]                  # get term
+        cmp     al, 17
+        je      mono_term_17_loop
+        cmp     al, 18
+        je      mono_term_18_loop
+
+        imul    rcx, rax, -4                # rcx is index to correlation sample
+        jmp     mono_default_term_loop
+
+        .balign  64
+
+mono_default_term_loop:
+        mov     edx, [rsi+rcx]              # edx = source sample "term" positions back (rcx = term * -4)
+        mov     ebx, edx                    # ebx = sam_A, kept for the weight update
+        imul    edx, ebp                    # 32-bit product of sam_A and weight
+        jo      1f                          # overflow: redo with the 64-bit multiply
+        lodsd                               # eax = next source sample, rsi += 4 (flags untouched)
+        sar     edx, 10                     # scale the product down; CF = last bit shifted out
+        sbb     eax, edx                    # subtract prediction; the borrow provides rounding
+        jmp     2f
+1:      mov     eax, ebx                    # overflow path: eax = sam_A
+        imul    ebp                         # edx:eax = sam_A * weight
+        shl     edx, 22                     # high part of (product >> 10)
+        shr     eax, 10                     # low part of (product >> 10); CF = rounding bit
+        adc     edx, eax                    # edx = apply_weight (sam_A)
+        lodsd                               # eax = next source sample, rsi += 4
+        sub     eax, edx                    # subtract the prediction
+2:      stosd                               # write result to the destination, rdi += 4 (flags untouched)
+        je      3f                          # result is zero: leave the weight unchanged
+        test    ebx, ebx
+        je      3f                          # sam_A is zero: leave the weight unchanged
+        xor     eax, ebx                    # sign bit set when sam_A and result signs differ
+        cdq                                 # edx = 0 (signs match) or -1 (signs differ)
+        xor     ebp, edx                    # conditional complement so the add acts as +delta or -delta
+        add     ebp, r8d                    # add delta
+        xor     ebp, edx                    # undo the complement
+3:      add     r9d, ebp                    # accumulate the weight into the weight sum
+        cmp     rsi, r10                    # reached eptr?
+        jnz     mono_default_term_loop
+
+        mov     rdx, [rsp]                  # rdx = *dpp
+        mov     [rdx+8], ebp                # put weight back
+        mov     [rdx+88], r9d               # put weight sum back
+        movsxd  rcx, DWORD PTR [rdx]        # rcx = dpp->term
+
+mono_default_store_samples:
+        dec     rcx                         # count down from term - 1 to 0
+        sub     rsi, 4                      # back up one sample
+        mov     eax, [rsi]                  # read it from the source buffer
+        mov     [rdx+rcx*4+16], eax         # store samples_A [ecx]
+        test    rcx, rcx
+        jnz     mono_default_store_samples  # repeat until index 0 has been written
+        jmp     mono_done
+
+        .balign  64
+
+mono_term_17_loop:
+        lea     edx, [rcx+rcx]              # 2 * previous source sample (ecx) ...
+        sub     edx, [rsi-8]                # ... minus the source sample before that
+        mov     ebx, edx                    # ebx = sam_A
+        imul    edx, ebp                    # 32-bit product of sam_A and weight
+        jo      1f                          # overflow: redo with the 64-bit multiply
+        sar     edx, 10                     # scale the product down; CF = last bit shifted out
+        lodsd                               # eax = next source sample, rsi += 4 (flags untouched)
+        mov     ecx, eax                    # remember it as the previous sample for the next pass
+        sbb     eax, edx                    # subtract prediction; the borrow provides rounding
+        jmp     2f
+1:      mov     eax, ebx                    # overflow path: eax = sam_A
+        imul    ebp                         # edx:eax = sam_A * weight
+        shl     edx, 22                     # high part of (product >> 10)
+        shr     eax, 10                     # low part of (product >> 10); CF = rounding bit
+        adc     edx, eax                    # edx = apply_weight (sam_A)
+        lodsd                               # eax = next source sample, rsi += 4
+        mov     ecx, eax                    # remember it as the previous sample for the next pass
+        sub     eax, edx                    # subtract the prediction
+2:      stosd                               # write result to the destination, rdi += 4 (flags untouched)
+        je      3f                          # result is zero: leave the weight unchanged
+        test    ebx, ebx
+        je      3f                          # sam_A is zero: leave the weight unchanged
+        xor     eax, ebx                    # sign bit set when sam_A and result signs differ
+        cdq                                 # edx = 0 (signs match) or -1 (signs differ)
+        xor     ebp, edx                    # conditional complement so the add acts as +delta or -delta
+        add     ebp, r8d                    # add delta
+        xor     ebp, edx                    # undo the complement
+3:      add     r9d, ebp                    # accumulate the weight into the weight sum
+        cmp     rsi, r10                    # reached eptr?
+        jnz     mono_term_17_loop
+        jmp     mono_term_1718_exit         # shared epilogue for terms 17 and 18
+
+        .balign  64
+
+mono_term_18_loop:
+        lea     edx, [rcx+rcx*2]            # 3 * previous source sample (ecx) ...
+        sub     edx, [rsi-8]                # ... minus the source sample before that ...
+        sar     edx, 1                      # ... halved with an arithmetic shift
+        mov     ebx, edx                    # ebx = sam_A
+        imul    edx, ebp                    # 32-bit product of sam_A and weight
+        jo      1f                          # overflow: redo with the 64-bit multiply
+        sar     edx, 10                     # scale the product down; CF = last bit shifted out
+        lodsd                               # eax = next source sample, rsi += 4 (flags untouched)
+        mov     ecx, eax                    # remember it as the previous sample for the next pass
+        sbb     eax, edx                    # subtract prediction; the borrow provides rounding
+        jmp     2f
+1:      mov     eax, ebx                    # overflow path: eax = sam_A
+        imul    ebp                         # edx:eax = sam_A * weight
+        shl     edx, 22                     # high part of (product >> 10)
+        shr     eax, 10                     # low part of (product >> 10); CF = rounding bit
+        adc     edx, eax                    # edx = apply_weight (sam_A)
+        lodsd                               # eax = next source sample, rsi += 4
+        mov     ecx, eax                    # remember it as the previous sample for the next pass
+        sub     eax, edx                    # subtract the prediction
+2:      stosd                               # write result to the destination, rdi += 4 (flags untouched)
+        je      3f                          # result is zero: leave the weight unchanged
+        test    ebx, ebx
+        je      3f                          # sam_A is zero: leave the weight unchanged
+        xor     eax, ebx                    # sign bit set when sam_A and result signs differ
+        cdq                                 # edx = 0 (signs match) or -1 (signs differ)
+        xor     ebp, edx                    # conditional complement so the add acts as +delta or -delta
+        add     ebp, r8d                    # add delta
+        xor     ebp, edx                    # undo the complement
+3:      add     r9d, ebp                    # accumulate the weight into the weight sum
+        cmp     rsi, r10                    # reached eptr?
+        jnz     mono_term_18_loop           # otherwise fall through to the shared epilogue
+
+mono_term_1718_exit:
+        mov     rdx, [rsp]                  # rdx = *dpp
+        mov     [rdx+8], ebp                # put weight back
+        mov     [rdx+88], r9d               # put weight sum back
+        mov     eax, [rsi-4]                # dpp->samples_A [0] = bptr [-1]
+        mov     [rdx+16], eax
+        mov     eax, [rsi-8]                # dpp->samples_A [1] = bptr [-2]
+        mov     [rdx+20], eax
+
+mono_done:
+        add     rsp, 8                      # release the dpp slot
+        pop     rsi                         # restore registers in reverse push order
+        pop     rdi
+        pop     rbx
+        pop     rbp
+        ret
+
+
+# This is an assembly optimized version of the following WavPack function:
+#
+# uint32_t scan_max_magnitude (int32_t *buffer, int32_t sample_count);
+#
+# This function scans a buffer of signed 32-bit ints and returns the magnitude
+# of the largest sample, with a power-of-two resolution. It might be more
+# useful to return the actual maximum absolute value, but that implementation
+# would be slower. Instead, this simply returns the "or" of all the values
+# "xor"d with their own sign, like so:
+#
+#     while (sample_count--)
+#         magnitude |= (*buffer < 0) ? ~*buffer++ : *buffer++;
+#
+# This is written to work on an X86-64 processor (also called the AMD64)
+# running in 64-bit mode and uses the MMX extensions to improve the
+# performance by processing two samples together.
+#
+# This version has entry points for both the System V ABI and the Windows
+# X64 ABI. It does not use the "red zone" or the "shadow area"; it saves the
+# non-volatile registers for both ABIs on the stack and allocates another
+# 8 bytes on the stack so that it's properly aligned. Note that it does NOT
+# provide unwind data for the Windows ABI (the unpack_x64.asm module for
+# MSVC does). The arguments are passed in registers:
+#
+#                             System V  Windows
+#   int32_t *buffer             rdi       rcx
+#   int32_t sample_count        rsi       rdx
+#
+# During the processing loops, the following registers are used:
+#
+#   rdi         buffer pointer
+#   rsi         termination buffer pointer
+#   ebx         single magnitude accumulator
+#   mm0         dual magnitude accumulator
+#   mm1, mm2    scratch
+#
+
+_scan_max_magnitude_x64win:                 # Windows x64 entry (underscore-prefixed alias)
+scan_max_magnitude_x64win:
+        push    rbp                         # save the registers this routine modifies
+        push    rbx
+        push    rdi
+        push    rsi
+        sub     rsp, 8                      # keeps the stack aligned; no locals are stored
+        mov     rdi, rcx                    # copy params from win regs to Linux regs
+        mov     rsi, rdx                    # so we can leave following code similar
+        mov     rdx, r8                     # copied for uniformity; only two arguments are documented
+        mov     rcx, r9                     # and rcx/rdx are overwritten below before being read
+        jmp     senter                      # continue in the shared body
+
+_scan_max_magnitude_x64:                    # System V entry (underscore-prefixed alias)
+scan_max_magnitude_x64:
+        push    rbp                         # same prologue as the Windows entry
+        push    rbx
+        push    rdi
+        push    rsi
+        sub     rsp, 8
+
+senter: xor     ebx, ebx                    # clear magnitude accumulator
+
+        mov     eax, esi                    # eax = count
+        and     eax, 7                      # count mod 8
+        mov     ecx, eax                    # ecx = leftover samples to "manually" scan at end
+
+        shr     esi, 3                      # esi = num of loops to process mmx (8 samples/loop)
+        shl     esi, 5                      # esi = num of bytes to process mmx (32 bytes/loop)
+        jz      nommx                       # jump around if no mmx loops to do (< 8 samples)
+
+        pxor    mm0, mm0                    # clear dual magnitude accumulator
+        add     rsi, rdi                    # rsi = termination buffer pointer for mmx loop
+        jmp     mmxlp                       # enter the aligned loop
+
+        .balign  64
+
+mmxlp:  movq    mm1, [rdi]                  # load two consecutive samples into mm1 & mm2
+        movq    mm2, mm1
+        psrad   mm1, 31                     # mm1 = sign (mm2)
+        pxor    mm1, mm2                    # mm1 = absolute magnitude, or into result
+        por     mm0, mm1
+
+        movq    mm1, [rdi+8]                # same steps for the remaining 6 samples (3 more pairs)
+        movq    mm2, mm1
+        psrad   mm1, 31
+        pxor    mm1, mm2
+        por     mm0, mm1
+
+        movq    mm1, [rdi+16]               # third pair
+        movq    mm2, mm1
+        psrad   mm1, 31
+        pxor    mm1, mm2
+        por     mm0, mm1
+
+        movq    mm1, [rdi+24]               # fourth pair
+        movq    mm2, mm1
+        psrad   mm1, 31
+        pxor    mm1, mm2
+        por     mm0, mm1
+
+        add     rdi, 32                     # advance past the 8 samples just scanned
+        cmp     rdi, rsi                    # reached the termination pointer?
+        jnz     mmxlp
+
+        movd    eax, mm0                    # ebx = "or" of high and low mm0
+        punpckhdq mm0, mm0                  # bring the high dword of mm0 down to the low dword
+        movd    ebx, mm0
+        or      ebx, eax
+        emms                                # leave MMX state before returning to the caller
+
+nommx:  and     ecx, ecx                    # any leftover samples to do?
+        jz      noleft
+
+leftlp: mov     eax, [rdi]                  # scalar path: one sample per iteration
+        cdq                                 # edx = 0 or -1 according to the sign of eax
+        xor     eax, edx                    # complement negative values
+        or      ebx, eax                    # fold into the accumulator
+        add     rdi, 4                      # next sample
+        loop    leftlp                      # decrement the count register and repeat until zero
+
+noleft: mov     eax, ebx                    # move magnitude to eax for return
+        add     rsp, 8                      # release the alignment slot
+        pop     rsi                         # restore registers in reverse push order
+        pop     rdi
+        pop     rbx
+        pop     rbp
+        ret
+
+
+# This is an assembly optimized version of the following WavPack function:
+#
+# uint32_t log2buffer (int32_t *samples, uint32_t num_samples, int limit);
+#
+# This function scans a buffer of 32-bit ints and accumulates the total
+# log2 value of all the samples. This is useful for determining maximum
+# compression because the bitstream storage required for entropy coding
+# is proportional to the base 2 log of the samples.
+#
+# This is written to work on an X86-64 processor (also called the AMD64)
+# running in 64-bit mode. This version has entry points for both the System
+# V ABI and the Windows X64 ABI. It does not use the "red zone" or the
+# "shadow area"; it saves the non-volatile registers for both ABIs on the
+# stack and allocates another 8 bytes on the stack so it's aligned properly.
+# Note that it does NOT provide unwind data for the Windows ABI (but the
+# unpack_x64.asm module for MSVC does). The arguments are passed in registers:
+#
+#                             System V  Windows  
+#   int32_t *samples            rdi       rcx
+#   uint32_t num_samples        esi       rdx
+#   int limit                   edx       r8
+#
+# During the processing loops, the following registers are used:
+#
+#   r8              pointer to the 256-byte log fraction table
+#   rsi             input buffer pointer
+#   edi             sum accumulator
+#   ebx             sample count
+#   ebp             limit (if specified non-zero)
+#   eax,ecx,edx     scratch
+#
+
+        .balign  256
+
+log2_table:                                 # 256 bytes (16 rows of 16), indexed by the 8-bit mantissa in log2buffer
+        .byte   0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0b, 0x0d, 0x0e, 0x10, 0x11, 0x12, 0x14, 0x15
+        .byte   0x16, 0x18, 0x19, 0x1a, 0x1c, 0x1d, 0x1e, 0x20, 0x21, 0x22, 0x24, 0x25, 0x26, 0x28, 0x29, 0x2a
+        .byte   0x2c, 0x2d, 0x2e, 0x2f, 0x31, 0x32, 0x33, 0x34, 0x36, 0x37, 0x38, 0x39, 0x3b, 0x3c, 0x3d, 0x3e
+        .byte   0x3f, 0x41, 0x42, 0x43, 0x44, 0x45, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4d, 0x4e, 0x4f, 0x50, 0x51
+        .byte   0x52, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63
+        .byte   0x64, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x74, 0x75
+        .byte   0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85
+        .byte   0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95
+        .byte   0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4
+        .byte   0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb2
+        .byte   0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc0
+        .byte   0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcb, 0xcc, 0xcd, 0xce
+        .byte   0xcf, 0xd0, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd8, 0xd9, 0xda, 0xdb
+        .byte   0xdc, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe4, 0xe5, 0xe6, 0xe7, 0xe7
+        .byte   0xe8, 0xe9, 0xea, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xee, 0xef, 0xf0, 0xf1, 0xf1, 0xf2, 0xf3, 0xf4
+        .byte   0xf4, 0xf5, 0xf6, 0xf7, 0xf7, 0xf8, 0xf9, 0xf9, 0xfa, 0xfb, 0xfc, 0xfc, 0xfd, 0xfe, 0xff, 0xff
+
+_log2buffer_x64win:                         # Windows x64 entry (underscore-prefixed alias)
+log2buffer_x64win:
+        push    rbp                         # save the registers this routine modifies
+        push    rbx
+        push    rdi
+        push    rsi
+        sub     rsp, 8                      # keeps the stack aligned; no locals are stored
+        mov     rdi, rcx                    # copy params from win regs to Linux regs
+        mov     rsi, rdx                    # so we can leave following code similar
+        mov     rdx, r8                     # third argument (limit)
+        mov     rcx, r9                     # copied for uniformity; ecx is only used as scratch below
+        jmp     log2bf                      # continue in the shared body
+
+_log2buffer_x64:                            # System V entry (underscore-prefixed alias)
+log2buffer_x64:
+        push    rbp                         # same prologue as the Windows entry
+        push    rbx
+        push    rdi
+        push    rsi
+        sub     rsp, 8
+
+log2bf: mov     ebx, esi                    # ebx = num_samples
+        mov     rsi, rdi                    # rsi = *samples
+        xor     edi, edi                    # initialize sum
+        lea     r8, [log2_table+rip]        # r8 = address of the log fraction table (rip-relative)
+        test    ebx, ebx                    # test count for zero
+        jz      normal_exit                 # no samples: return a sum of 0
+        mov     ebp, edx                    # ebp = limit
+        test    ebp, ebp                    # we have separate loops for limit and no limit
+        jz      no_limit_loop
+        jmp     limit_loop
+
+        .balign  64
+
+limit_loop:
+        mov     eax, [rsi]                  # get next sample into eax
+        cdq                                 # edx = sign of sample (for abs)
+        add     rsi, 4                      # advance the input pointer
+        xor     eax, edx                    # complement if negative ...
+        sub     eax, edx                    # ... and add 1: eax = absolute value
+        je      L40                         # skip if sample was zero
+        mov     edx, eax                    # move to edx and apply rounding
+        shr     eax, 9
+        add     edx, eax                    # edx = value + (value >> 9)
+        bsr     ecx, edx                    # ecx = MSB set in sample (0 - 31)
+        lea     eax, [ecx+1]                # eax = number used bits in sample (1 - 32)
+        sub     ecx, 8                      # ecx = shift right amount (-8 to 23)
+        ror     edx, cl                     # use rotate to do "signed" shift
+        shl     eax, 8                      # move nbits to integer portion of log
+        movzx   edx, dl                     # dl = mantissa, look up log fraction in table
+        mov     al, [r8+rdx]                # eax = combined integer and fraction for full log
+        add     edi, eax                    # add this sample's log to the running sum
+        cmp     eax, ebp                    # compare this sample's log (not the sum) with the limit
+        jge     limit_exceeded              # signed compare: at or above the limit aborts the scan
+L40:    sub     ebx, 1                      # loop back if more samples
+        jne     limit_loop
+        jmp     normal_exit
+
+        .balign  64
+
+no_limit_loop:
+        mov     eax, [rsi]                  # get next sample into eax
+        cdq                                 # edx = sign of sample (for abs)
+        add     rsi, 4                      # advance the input pointer
+        xor     eax, edx                    # complement if negative ...
+        sub     eax, edx                    # ... and add 1: eax = absolute value
+        je      L45                         # skip if sample was zero
+        mov     edx, eax                    # move to edx and apply rounding
+        shr     eax, 9
+        add     edx, eax                    # edx = value + (value >> 9)
+        bsr     ecx, edx                    # ecx = MSB set in sample (0 - 31)
+        lea     eax, [ecx+1]                # eax = number used bits in sample (1 - 32)
+        sub     ecx, 8                      # ecx = shift right amount (-8 to 23)
+        ror     edx, cl                     # use rotate to do "signed" shift
+        shl     eax, 8                      # move nbits to integer portion of log
+        movzx   edx, dl                     # dl = mantissa, look up log fraction in table
+        mov     al, [r8+rdx]                # eax = combined integer and fraction for full log
+        add     edi, eax                    # add to running sum
+L45:    sub     ebx, 1                      # loop back if more samples
+        jne     no_limit_loop
+        jmp     normal_exit
+
+limit_exceeded:
+        mov     edi, -1                     # return -1 to indicate limit hit
+normal_exit:
+        mov     eax, edi                    # move sum accumulator into eax for return
+        add     rsp, 8                      # release the alignment slot
+        pop     rsi                         # restore registers in reverse push order
+        pop     rdi
+        pop     rbx
+        pop     rbp
+        ret
+
+#ifdef __ELF__
+        .section .note.GNU-stack,"",@progbits   # empty note section with no flags; by GNU toolchain convention this marks the object as not needing an executable stack
+#endif
+
diff --git a/third_party/wavpack/src/pack_x64.asm b/third_party/wavpack/src/pack_x64.asm
new file mode 100644
index 0000000..2a4b551
--- /dev/null
+++ b/third_party/wavpack/src/pack_x64.asm
@@ -0,0 +1,1852 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;                           **** WAVPACK ****                            ;;
+;;                  Hybrid Lossless Wavefile Compressor                   ;;
+;;              Copyright (c) 1998 - 2015 Conifer Software.               ;;
+;;                          All Rights Reserved.                          ;;
+;;      Distributed under the BSD Software License (see license.txt)      ;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+        include <ksamd64.inc>
+
+        public  pack_decorr_stereo_pass_cont_rev_x64win
+        public  pack_decorr_stereo_pass_cont_x64win
+
+asmcode segment page 'CODE'
+
+; This module contains X64 assembly optimized versions of functions required
+; to encode WavPack files.
+
+; This is an assembly optimized version of the following WavPack function:
+;
+; void pack_decorr_stereo_pass (
+;   struct decorr_pass *dpp,
+;   int32_t *buffer,
+;   int32_t sample_count);
+;
+; It performs a single pass of stereo decorrelation, in place, as specified
+; by the decorr_pass structure. Note that this function does NOT return the
+; dpp->samples_X[] values in the "normalized" positions for terms 1-8, so if
+; the number of samples is not a multiple of MAX_TERM, these must be moved if
+; they are to be used somewhere else.
+;
+; This is written to work on an X86-64 processor (also called the AMD64)
+; running in 64-bit mode and uses the MMX extensions to improve the
+; performance by processing both stereo channels together. It is based on
+; the original MMX code written by Joachim Henke that used MMX intrinsics
+; called from C. Many thanks to Joachim for that!
+;
+; An issue with using MMX for this is that the sample history array in the
+; decorr_pass structure contains separate arrays for each channel while the
+; MMX code wants there to be a single array of dual samples. The fix for
+; this is to convert the data in the arrays on entry and exit, and this is
+; made easy by the fact that the 8 MMX registers hold exactly the required
+; amount of data (64 bytes)!
+;
+; This is written to work on an X86-64 processor (also called the AMD64)
+; running in 64-bit mode. This version is for the 64-bit Windows ABI and
+; provides appropriate prologs and epilogs for stack unwinding. The
+; arguments are passed in registers:
+;
+;   struct decorr_pass *dpp       rcx
+;   int32_t *buffer               rdx
+;   int32_t sample_count          r8d
+;
+; During the processing loops, the following registers are used:
+;
+;   rdi         buffer pointer
+;   rsi         termination buffer pointer
+;   rax,rbx,rdx used in default term to reduce calculation         
+;   rbp         decorr_pass pointer
+;   mm0, mm1    scratch
+;   mm2         original sample values
+;   mm3         correlation samples
+;   mm4         0 (for pcmpeqd)
+;   mm5         weights
+;   mm6         delta
+;   mm7         512 (for rounding)
+;
+
+pack_decorr_stereo_pass_x64win proc frame
+        push_reg    rbp                     ; save non-volatile registers on stack
+        push_reg    rbx                     ; (alphabetically)
+        push_reg    rdi
+        push_reg    rsi
+        alloc_stack 8                       ; allocate 8 bytes on stack & align to 16 bytes
+        end_prologue
+
+        mov     rdi, rcx                    ; copy params from win regs to Linux regs
+        mov     rsi, rdx                    ; so we can leave following code similar
+        mov     rdx, r8                     ; edx = sample_count
+        mov     rcx, r9                     ; (r9 is not an argument here; rcx is reloaded before it is read)
+        ; dpp offsets used below: +0 term, +4 delta, +8 weight_A/B, +16 samples_A[8], +48 samples_B[8]
+        mov     rbp, rdi                    ; rbp = *dpp
+        mov     rdi, rsi                    ; rdi = inbuffer
+        mov     esi, edx                    ; esi = sample_count
+        sal     esi, 3                      ; * 8 bytes per stereo pair (ZF set if result is zero)
+        jz      bempty                      ; no samples: exit without touching dpp (mm5/history not set up yet)
+        add     rsi, rdi                    ; rsi = termination buffer pointer
+
+        ; convert samples_A and samples_B array into samples_AB array for MMX
+        ; (the MMX registers provide exactly enough storage to do this easily)
+
+        movq        mm0, [rbp+16]           ; mm0 = A[0], A[1]
+        punpckldq   mm0, [rbp+48]           ; mm0 = A[0], B[0]
+        movq        mm1, [rbp+16]
+        punpckhdq   mm1, [rbp+48]           ; mm1 = A[1], B[1]
+        movq        mm2, [rbp+24]
+        punpckldq   mm2, [rbp+56]           ; mm2 = A[2], B[2]
+        movq        mm3, [rbp+24]
+        punpckhdq   mm3, [rbp+56]           ; mm3 = A[3], B[3]
+        movq        mm4, [rbp+32]
+        punpckldq   mm4, [rbp+64]           ; mm4 = A[4], B[4]
+        movq        mm5, [rbp+32]
+        punpckhdq   mm5, [rbp+64]           ; mm5 = A[5], B[5]
+        movq        mm6, [rbp+40]
+        punpckldq   mm6, [rbp+72]           ; mm6 = A[6], B[6]
+        movq        mm7, [rbp+40]
+        punpckhdq   mm7, [rbp+72]           ; mm7 = A[7], B[7]
+
+        movq    [rbp+16], mm0               ; write the interleaved pairs back as samples_AB[0..7]
+        movq    [rbp+24], mm1
+        movq    [rbp+32], mm2
+        movq    [rbp+40], mm3
+        movq    [rbp+48], mm4
+        movq    [rbp+56], mm5
+        movq    [rbp+64], mm6
+        movq    [rbp+72], mm7
+
+        mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  ; mm7 = round (512)
+
+        mov     eax, [rbp+4]
+        movd    mm6, eax
+        punpckldq mm6, mm6                  ; mm6 = delta (0-7)
+
+        mov     eax, 0FFFFh                 ; mask high weights to zero for PMADDWD
+        movd    mm5, eax
+        punpckldq mm5, mm5                  ; mm5 = weight mask 0x0000FFFF0000FFFF
+        pand    mm5, [rbp+8]                ; mm5 = weight_AB masked to 16-bit
+
+        movq    mm4, [rbp+16]               ; preload samples_AB[0]
+
+        mov     al, [rbp]                   ; get term and vector to correct loop
+        cmp     al, 17
+        je      buff_term_17_loop
+        cmp     al, 18
+        je      buff_term_18_loop
+        cmp     al, -1
+        je      buff_term_minus_1_loop
+        cmp     al, -2
+        je      buff_term_minus_2_loop
+        cmp     al, -3
+        je      buff_term_minus_3_loop
+
+        pxor    mm4, mm4                    ; mm4 = 0 (for pcmpeqd)
+        xor     eax, eax                    ; eax = history read index, starts at 0
+        xor     ebx, ebx
+        add     bl, [rbp]                   ; ebx = term (low byte of dpp->term)
+        mov     ecx, 7                      ; ecx = wrap mask for the 8-entry history
+        and     ebx, ecx                    ; ebx = history write index = term & 7
+        jmp     buff_default_term_loop
+
+        align  64
+
+buff_default_term_loop:
+        movq    mm2, [rdi]                  ; mm2 = left_right
+        movq    mm3, [rbp+16+rax*8]         ; mm3 = sam_AB read from circular history
+        inc     eax
+        and     eax, ecx                    ; advance read index modulo 8
+        movq    [rbp+16+rbx*8], mm2         ; store the input pair into the history
+        inc     ebx
+        and     ebx, ecx                    ; advance write index modulo 8
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1                      ; low word of each dword = sam & 0x7FFF
+        pmaddwd mm1, mm5                    ; mm1 = (sam & 0x7FFF) * weight (weight high word is zero)
+
+        movq    mm0, mm3
+        psrld   mm0, 15                     ; low word of each dword = bits 15-30 of sam
+        pmaddwd mm0, mm5                    ; mm0 = (upper part of sam) * weight
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0                    ; subtract high part of the weighted sample
+        psubd   mm2, mm1                    ; subtract rounded low part: mm2 = result
+        movq    mm0, mm3
+        movq    [rdi], mm2                  ; store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ result)
+        add     rdi, 8
+        pcmpeqd mm2, mm4                    ; mm2 = 1s if result was zero
+        pcmpeqd mm3, mm4                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pxor    mm5, mm0                    ; xor / add / xor = add delta if signs match, else subtract
+        paddw   mm5, mm2                    ; and add to weight_AB
+        pxor    mm5, mm0
+        cmp     rdi, rsi
+        jnz     buff_default_term_loop
+
+        jmp     bdone
+
+        align  64
+
+buff_term_17_loop:
+        movq    mm3, mm4                    ; get previous calculated value
+        paddd   mm3, mm4
+        psubd   mm3, [rbp+24]               ; mm3 = sam_AB = 2 * samples_AB[0] - samples_AB[1]
+        movq    [rbp+24], mm4               ; samples_AB[1] = old samples_AB[0]
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1                      ; low word of each dword = sam & 0x7FFF
+        pmaddwd mm1, mm5                    ; mm1 = (sam & 0x7FFF) * weight
+
+        movq    mm0, mm3
+        psrld   mm0, 15                     ; low word of each dword = bits 15-30 of sam
+        pmaddwd mm0, mm5                    ; mm0 = (upper part of sam) * weight
+
+        movq    mm2, [rdi]                  ; mm2 = left_right
+        movq    mm4, mm2                    ; mm4 = new samples_AB[0] (the input pair)
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0                    ; subtract high part of the weighted sample
+        psubd   mm2, mm1                    ; subtract rounded low part: mm2 = result
+        movq    mm0, mm3
+        movq    [rdi], mm2                  ; store result
+        pxor    mm1, mm1                    ; mm1 = 0 (for pcmpeqd)
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ result)
+        add     rdi, 8
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if result was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pxor    mm5, mm0                    ; xor / add / xor = add delta if signs match, else subtract
+        paddw   mm5, mm2                    ; and add to weight_AB
+        pxor    mm5, mm0
+        cmp     rdi, rsi
+        jnz     buff_term_17_loop
+
+        movq    [rbp+16], mm4               ; post-store samples_AB[0]
+        jmp     bdone
+
+        align  64
+
+buff_term_18_loop:
+        movq    mm3, mm4                    ; get previous calculated value
+        psubd   mm3, [rbp+24]
+        psrad   mm3, 1
+        paddd   mm3, mm4                    ; mm3 = sam_AB = s[0] + ((s[0] - s[1]) >> 1)
+        movq    [rbp+24], mm4               ; samples_AB[1] = old samples_AB[0]
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1                      ; low word of each dword = sam & 0x7FFF
+        pmaddwd mm1, mm5                    ; mm1 = (sam & 0x7FFF) * weight
+
+        movq    mm0, mm3
+        psrld   mm0, 15                     ; low word of each dword = bits 15-30 of sam
+        pmaddwd mm0, mm5                    ; mm0 = (upper part of sam) * weight
+
+        movq    mm2, [rdi]                  ; mm2 = left_right
+        movq    mm4, mm2                    ; mm4 = new samples_AB[0] (the input pair)
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0                    ; subtract high part of the weighted sample
+        psubd   mm2, mm1                    ; subtract rounded low part: mm2 = result
+        movq    mm0, mm3
+        movq    [rdi], mm2                  ; store result
+        pxor    mm1, mm1                    ; mm1 = 0 (for pcmpeqd)
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ result)
+        add     rdi, 8
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if result was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pxor    mm5, mm0                    ; xor / add / xor = add delta if signs match, else subtract
+        paddw   mm5, mm2                    ; and add to weight_AB
+        pxor    mm5, mm0
+        cmp     rdi, rsi
+        jnz     buff_term_18_loop
+
+        movq    [rbp+16], mm4               ; post-store samples_AB[0]
+        jmp     bdone
+
+        align  64
+
+buff_term_minus_1_loop:
+        movq    mm3, mm4                    ; mm3 = previous calculated value
+        movq    mm2, [rdi]                  ; mm2 = left_right
+        movq    mm4, mm2
+        psrlq   mm4, 32                     ; mm4 = current right in low dword (kept for next pass)
+        punpckldq mm3, mm2                  ; mm3 = sam_AB = (saved low dword, current left)
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1                      ; low word of each dword = sam & 0x7FFF
+        pmaddwd mm1, mm5                    ; mm1 = (sam & 0x7FFF) * weight
+
+        movq    mm0, mm3
+        psrld   mm0, 15                     ; low word of each dword = bits 15-30 of sam
+        pmaddwd mm0, mm5                    ; mm0 = (upper part of sam) * weight
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0                    ; subtract high part of the weighted sample
+        psubd   mm2, mm1                    ; subtract rounded low part: mm2 = result
+        movq    mm0, mm3
+        movq    [rdi], mm2                  ; store result
+        pxor    mm1, mm1                    ; mm1 = 0 (for pcmpeqd)
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ result)
+        add     rdi, 8
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if result was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pcmpeqd mm1, mm1                    ; mm1 = -1
+        psubd   mm1, mm7
+        psubd   mm1, mm7                    ; mm1 = -1025
+        psubd   mm1, mm0                    ; mm1 = -1025 - sign: bias for the saturating add
+        pxor    mm5, mm0
+        paddw   mm5, mm1                    ; bias so unsigned saturation limits weight to +/-1024
+        paddusw mm5, mm2                    ; and add to weight_AB
+        psubw   mm5, mm1                    ; remove the bias again
+        pxor    mm5, mm0
+        cmp     rdi, rsi
+        jnz     buff_term_minus_1_loop
+
+        movq    [rbp+16], mm4               ; post-store samples_AB[0]
+        jmp     bdone
+
+        align  64
+
+buff_term_minus_2_loop:
+        movq    mm2, [rdi]                  ; mm2 = left_right
+        movq    mm3, mm2
+        psrlq   mm3, 32                     ; mm3 = current right in low dword, 0 in high dword
+        por     mm3, mm4                    ; merge in mm4 (NOTE(review): relies on mm4 low dword being 0)
+        punpckldq mm4, mm2                  ; mm4 high dword = current left (kept for next pass)
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1                      ; low word of each dword = sam & 0x7FFF
+        pmaddwd mm1, mm5                    ; mm1 = (sam & 0x7FFF) * weight
+
+        movq    mm0, mm3
+        psrld   mm0, 15                     ; low word of each dword = bits 15-30 of sam
+        pmaddwd mm0, mm5                    ; mm0 = (upper part of sam) * weight
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0                    ; subtract high part of the weighted sample
+        psubd   mm2, mm1                    ; subtract rounded low part: mm2 = result
+        movq    mm0, mm3
+        movq    [rdi], mm2                  ; store result
+        pxor    mm1, mm1                    ; mm1 = 0 (for pcmpeqd)
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ result)
+        add     rdi, 8
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if result was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pcmpeqd mm1, mm1                    ; mm1 = -1
+        psubd   mm1, mm7
+        psubd   mm1, mm7                    ; mm1 = -1025
+        psubd   mm1, mm0                    ; mm1 = -1025 - sign: bias for the saturating add
+        pxor    mm5, mm0
+        paddw   mm5, mm1                    ; bias so unsigned saturation limits weight to +/-1024
+        paddusw mm5, mm2                    ; and add to weight_AB
+        psubw   mm5, mm1                    ; remove the bias again
+        pxor    mm5, mm0
+        cmp     rdi, rsi
+        jnz     buff_term_minus_2_loop
+
+        movq    [rbp+16], mm4               ; post-store samples_AB[0]
+        jmp     bdone
+
+        align  64
+
+buff_term_minus_3_loop:
+        movq    mm2, [rdi]                  ; mm2 = left_right
+        movq    mm3, mm4                    ; mm3 = sam_AB (value saved by the previous pass)
+        movq    mm4, mm2                    ; mm4 = swap dwords of new data
+        psrlq   mm4, 32
+        punpckldq mm4, mm2                  ; mm4 = (right, left), kept for next pass
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1                      ; low word of each dword = sam & 0x7FFF
+        pmaddwd mm1, mm5                    ; mm1 = (sam & 0x7FFF) * weight
+
+        movq    mm0, mm3
+        psrld   mm0, 15                     ; low word of each dword = bits 15-30 of sam
+        pmaddwd mm0, mm5                    ; mm0 = (upper part of sam) * weight
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0                    ; subtract high part of the weighted sample
+        psubd   mm2, mm1                    ; subtract rounded low part: mm2 = result
+        movq    mm0, mm3
+        movq    [rdi], mm2                  ; store result
+        pxor    mm1, mm1                    ; mm1 = 0 (for pcmpeqd)
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ result)
+        add     rdi, 8
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if result was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pcmpeqd mm1, mm1                    ; mm1 = -1
+        psubd   mm1, mm7
+        psubd   mm1, mm7                    ; mm1 = -1025
+        psubd   mm1, mm0                    ; mm1 = -1025 - sign: bias for the saturating add
+        pxor    mm5, mm0
+        paddw   mm5, mm1                    ; bias so unsigned saturation limits weight to +/-1024
+        paddusw mm5, mm2                    ; and add to weight_AB
+        psubw   mm5, mm1                    ; remove the bias again
+        pxor    mm5, mm0
+        cmp     rdi, rsi
+        jnz     buff_term_minus_3_loop
+
+        movq    [rbp+16], mm4               ; post-store samples_AB[0]
+
+bdone:  pslld   mm5, 16                     ; sign-extend 16-bit weights back to dwords
+        psrad   mm5, 16
+        movq    [rbp+8], mm5                ; put weight_AB back
+
+        ; convert samples_AB array back into samples_A and samples_B
+
+        movq    mm0, [rbp+16]               ; load all 8 interleaved pairs before overwriting them
+        movq    mm1, [rbp+24]
+        movq    mm2, [rbp+32]
+        movq    mm3, [rbp+40]
+        movq    mm4, [rbp+48]
+        movq    mm5, [rbp+56]
+        movq    mm6, [rbp+64]
+        movq    mm7, [rbp+72]
+
+        movd    DWORD PTR [rbp+16], mm0     ; low dwords -> samples_A[0..7]
+        movd    DWORD PTR [rbp+20], mm1
+        movd    DWORD PTR [rbp+24], mm2
+        movd    DWORD PTR [rbp+28], mm3
+        movd    DWORD PTR [rbp+32], mm4
+        movd    DWORD PTR [rbp+36], mm5
+        movd    DWORD PTR [rbp+40], mm6
+        movd    DWORD PTR [rbp+44], mm7
+
+        punpckhdq   mm0, mm0                ; bring each high dword down to the low dword
+        punpckhdq   mm1, mm1
+        punpckhdq   mm2, mm2
+        punpckhdq   mm3, mm3
+        punpckhdq   mm4, mm4
+        punpckhdq   mm5, mm5
+        punpckhdq   mm6, mm6
+        punpckhdq   mm7, mm7
+
+        movd    DWORD PTR [rbp+48], mm0     ; high dwords -> samples_B[0..7]
+        movd    DWORD PTR [rbp+52], mm1
+        movd    DWORD PTR [rbp+56], mm2
+        movd    DWORD PTR [rbp+60], mm3
+        movd    DWORD PTR [rbp+64], mm4
+        movd    DWORD PTR [rbp+68], mm5
+        movd    DWORD PTR [rbp+72], mm6
+        movd    DWORD PTR [rbp+76], mm7
+
+        emms
+
+bempty: add     rsp, 8                      ; zero-sample path joins here (no MMX used, dpp unchanged)
+        pop     rsi
+        pop     rdi
+        pop     rbx
+        pop     rbp
+        ret
+
+pack_decorr_stereo_pass_x64win endp
+
+; These are assembly optimized versions of the following WavPack functions:
+;
+; void pack_decorr_stereo_pass_cont (
+;   struct decorr_pass *dpp,
+;   int32_t *in_buffer,
+;   int32_t *out_buffer,
+;   int32_t sample_count);
+;
+; void pack_decorr_stereo_pass_cont_rev (
+;   struct decorr_pass *dpp,
+;   int32_t *in_buffer,
+;   int32_t *out_buffer,
+;   int32_t sample_count);
+;
+; It performs a single pass of stereo decorrelation, transferring from the
+; input buffer to the output buffer. Note that this version of the function
+; requires that the up to 8 previous (depending on dpp->term) stereo samples
+; are visible and correct. In other words, it ignores the "samples_*"
+; fields in the decorr_pass structure and gets the history data directly
+; from the source buffer. It does, however, return the appropriate history
+; samples to the decorr_pass structure before returning.
+;
+; This is written to work on an X86-64 processor (also called the AMD64)
+; running in 64-bit mode and uses the MMX extensions to improve the
+; performance by processing both stereo channels together. It is based on
+; the original MMX code written by Joachim Henke that used MMX intrinsics
+; called from C. Many thanks to Joachim for that!
+;
+; This version is for 64-bit Windows. Note that the two public functions
+; are "leaf" functions that simply load rax with the direction and jump
+; into the private common "frame" function. The arguments are passed in
+; registers:
+;
+;   struct decorr_pass *dpp     rcx
+;   int32_t *in_buffer          rdx
+;   int32_t *out_buffer         r8
+;   int32_t sample_count        r9d
+;
+; During the processing loops, the following registers are used:
+;
+;   rdi         input buffer pointer
+;   rsi         direction (-8 forward, +8 reverse)
+;   rbx         delta from input to output buffer
+;   ecx         sample count
+;   rdx         sign (dir) * term * -8 (terms 1-8 only)
+;   mm0, mm1    scratch
+;   mm2         original sample values
+;   mm3         correlation samples
+;   mm4         weight sums
+;   mm5         weights
+;   mm6         delta
+;   mm7         512 (for rounding)
+;
+; stack usage:
+;
+; [rsp+0] = *dpp
+;
+
+pack_decorr_stereo_pass_cont_rev_x64win:
+        mov     rax, 8                      ; get value for reverse direction & jump
+        jmp     pack_decorr_stereo_pass_cont_common ; args stay in rcx, rdx, r8, r9; common code copies rax to rsi
+
+pack_decorr_stereo_pass_cont_x64win:
+        mov     rax, -8                     ; get value for forward direction & jump
+        jmp     pack_decorr_stereo_pass_cont_common ; args stay in rcx, rdx, r8, r9; common code copies rax to rsi
+
+pack_decorr_stereo_pass_cont_common proc frame
+        push_reg    rbp                     ; save non-volatile registers on stack
+        push_reg    rbx                     ; (alphabetically)
+        push_reg    rdi
+        push_reg    rsi
+        alloc_stack 8                       ; allocate 8 bytes on stack & align to 16 bytes
+        end_prologue
+
+        mov     [rsp], rcx                  ; [rsp] = *dpp
+        mov     rdi, rcx                    ; copy params from win regs to Linux regs
+        mov     rsi, rdx                    ; so we can leave following code similar
+        mov     rdx, r8
+        mov     rcx, r9
+
+        mov     rdi, rsi                    ; rdi = inbuffer
+        mov     rsi, rax                    ; rsi = -direction
+
+        mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  ; mm7 = round (512)
+
+        mov     rax, [rsp]                  ; access dpp
+        mov     eax, [rax+4]
+        movd    mm6, eax
+        punpckldq mm6, mm6                  ; mm6 = delta (0-7)
+
+        mov     rax, [rsp]                  ; access dpp
+        movq    mm5, [rax+8]                ; mm5 = weight_AB
+        movq    mm4, [rax+88]               ; mm4 = sum_AB
+
+        mov     rbx, rdx                    ; rbx = out_buffer (rdx) - in_buffer (rdi)
+        sub     rbx, rdi
+
+        mov     rax, [rsp]                  ; *eax = dpp
+        movsxd  rax, DWORD PTR [rax]        ; get term and vector to correct loop
+        cmp     al, 17
+        je      term_17_loop
+        cmp     al, 18
+        je      term_18_loop
+        cmp     al, -1
+        je      term_minus_1_loop
+        cmp     al, -2
+        je      term_minus_2_loop
+        cmp     al, -3
+        je      term_minus_3_loop
+
+        sal     rax, 3
+        mov     rdx, rax                    ; rdx = term * 8 to index correlation sample
+        test    rsi, rsi                    ; test direction
+        jns     default_term_loop
+        neg     rdx
+        jmp     default_term_loop
+
+        align  64
+
+default_term_loop:
+        movq    mm3, [rdi+rdx]              ; mm3 = sam_AB
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16
+        pmaddwd mm0, mm5
+
+        movq    mm2, [rdi]                  ; mm2 = left_right
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    ; add shifted sums
+        movq    mm0, mm3
+        movq    [rdi+rbx], mm2              ; store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ left_right)
+        sub     rdi, rsi
+        pxor    mm1, mm1                    ; mm1 = zero
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pxor    mm5, mm0
+        paddd   mm5, mm2                    ; and add to weight_AB
+        pxor    mm5, mm0
+        paddd   mm4, mm5                    ; add weights to sum
+        dec     ecx
+        jnz     default_term_loop
+
+        mov     rax, [rsp]                  ; access dpp
+        movq    [rax+8], mm5                ; put weight_AB back
+        movq    [rax+88], mm4               ; put sum_AB back
+        emms
+
+        mov     rdx, [rsp]                  ; access dpp with rdx
+        movsxd  rcx, DWORD PTR [rdx]        ; rcx = dpp->term
+
+default_store_samples:
+        dec     rcx
+        add     rdi, rsi                    ; back up one full sample
+        mov     eax, [rdi+4]
+        mov     [rdx+rcx*4+48], eax         ; store samples_B [ecx]
+        mov     eax, [rdi]
+        mov     [rdx+rcx*4+16], eax         ; store samples_A [ecx]
+        test    rcx, rcx
+        jnz     default_store_samples
+        jmp     done
+
+        align  64
+
+term_17_loop:
+        movq    mm3, [rdi+rsi]              ; get previous calculated value
+        paddd   mm3, mm3
+        psubd   mm3, [rdi+rsi*2]
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16
+        pmaddwd mm0, mm5
+
+        movq    mm2, [rdi]                  ; mm2 = left_right
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    ; add shifted sums
+        movq    mm0, mm3
+        movq    [rdi+rbx], mm2              ; store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ left_right)
+        sub     rdi, rsi
+        pxor    mm1, mm1                    ; mm1 = zero
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pxor    mm5, mm0
+        paddd   mm5, mm2                    ; and add to weight_AB
+        pxor    mm5, mm0
+        paddd   mm4, mm5                    ; add weights to sum
+        dec     ecx
+        jnz     term_17_loop
+
+        mov     rax, [rsp]                  ; access dpp
+        movq    [rax+8], mm5                ; put weight_AB back
+        movq    [rax+88], mm4               ; put sum_AB back
+        emms
+        jmp     term_1718_common_store
+
+        align  64
+
+term_18_loop:
+        movq    mm3, [rdi+rsi]              ; get previous calculated value
+        movq    mm0, mm3
+        psubd   mm3, [rdi+rsi*2]
+        psrad   mm3, 1
+        paddd   mm3, mm0                    ; mm3 = sam_AB
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16
+        pmaddwd mm0, mm5
+
+        movq    mm2, [rdi]                  ; mm2 = left_right
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    ; add shifted sums
+        movq    mm0, mm3
+        movq    [rdi+rbx], mm2              ; store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ left_right)
+        sub     rdi, rsi
+        pxor    mm1, mm1                    ; mm1 = zero
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pxor    mm5, mm0
+        paddd   mm5, mm2                    ; and add to weight_AB
+        pxor    mm5, mm0
+        dec     ecx
+        paddd   mm4, mm5                    ; add weights to sum
+        jnz     term_18_loop
+
+        mov     rax, [rsp]                  ; access dpp
+        movq    [rax+8], mm5                ; put weight_AB back
+        movq    [rax+88], mm4               ; put sum_AB back
+        emms
+
+term_1718_common_store:
+
+        mov     rax, [rsp]                  ; access dpp
+        add     rdi, rsi                    ; back up a full sample
+        mov     edx, [rdi+4]                ; dpp->samples_B [0] = iptr [-1];
+        mov     [rax+48], edx
+        mov     edx, [rdi]                  ; dpp->samples_A [0] = iptr [-2];
+        mov     [rax+16], edx
+        add     rdi, rsi                    ; back up another sample
+        mov     edx, [rdi+4]                ; dpp->samples_B [1] = iptr [-3];
+        mov     [rax+52], edx
+        mov     edx, [rdi]                  ; dpp->samples_A [1] = iptr [-4];
+        mov     [rax+20], edx
+        jmp     done
+
+        align  64
+
+term_minus_1_loop:
+        movq    mm3, [rdi+rsi]              ; mm3 = previous calculated value
+        movq    mm2, [rdi]                  ; mm2 = left_right
+        psrlq   mm3, 32
+        punpckldq mm3, mm2                  ; mm3 = sam_AB
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16
+        pmaddwd mm0, mm5
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    ; add shifted sums
+        movq    mm0, mm3
+        movq    [rdi+rbx], mm2              ; store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ left_right)
+        sub     rdi, rsi
+        pxor    mm1, mm1                    ; mm1 = zero
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pcmpeqd mm1, mm1
+        psubd   mm1, mm7
+        psubd   mm1, mm7
+        psubd   mm1, mm0
+        pxor    mm5, mm0
+        paddd   mm5, mm1
+        paddusw mm5, mm2                    ; and add to weight_AB
+        psubd   mm5, mm1
+        pxor    mm5, mm0
+        paddd   mm4, mm5                    ; add weights to sum
+        dec     ecx
+        jnz     term_minus_1_loop
+
+        mov     rax, [rsp]                  ; access dpp
+        movq    [rax+8], mm5                ; put weight_AB back
+        movq    [rax+88], mm4               ; put sum_AB back
+        emms
+
+        add     rdi, rsi                    ; back up a full sample
+        mov     edx, [rdi+4]                ; dpp->samples_A [0] = iptr [-1];
+        mov     rax, [rsp]
+        mov     [rax+16], edx
+        jmp     done
+
+        align  64
+
+term_minus_2_loop:
+        movq    mm2, [rdi]                  ; mm2 = left_right
+        movq    mm3, mm2                    ; mm3 = swap dwords
+        psrlq   mm3, 32
+        punpckldq mm3, [rdi+rsi]            ; mm3 = sam_AB
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16
+        pmaddwd mm0, mm5
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    ; add shifted sums
+        movq    mm0, mm3
+        movq    [rdi+rbx], mm2              ; store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ left_right)
+        sub     rdi, rsi
+        pxor    mm1, mm1                    ; mm1 = zero
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pcmpeqd mm1, mm1
+        psubd   mm1, mm7
+        psubd   mm1, mm7
+        psubd   mm1, mm0
+        pxor    mm5, mm0
+        paddd   mm5, mm1
+        paddusw mm5, mm2                    ; and add to weight_AB
+        psubd   mm5, mm1
+        pxor    mm5, mm0
+        paddd   mm4, mm5                    ; add weights to sum
+        dec     ecx
+        jnz     term_minus_2_loop
+
+        mov     rax, [rsp]                  ; access dpp
+        movq    [rax+8], mm5                ; put weight_AB back
+        movq    [rax+88], mm4               ; put sum_AB back
+        emms
+
+        add     rdi, rsi                    ; back up a full sample
+        mov     edx, [rdi]                  ; dpp->samples_B [0] = iptr [-2];
+        mov     rax, [rsp]
+        mov     [rax+48], edx
+        jmp     done
+
+        align  64
+
+term_minus_3_loop:
+        movq    mm0, [rdi+rsi]              ; mm0 = previous calculated value
+        movq    mm3, mm0                    ; mm3 = swap dwords
+        psrlq   mm3, 32
+        punpckldq mm3, mm0                  ; mm3 = sam_AB
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16
+        pmaddwd mm0, mm5
+
+        movq    mm2, [rdi]                  ; mm2 = left_right
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    ; add shifted sums
+        movq    mm0, mm3
+        movq    [rdi+rbx], mm2              ; store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ left_right)
+        sub     rdi, rsi
+        pxor    mm1, mm1                    ; mm1 = zero
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pcmpeqd mm1, mm1
+        psubd   mm1, mm7
+        psubd   mm1, mm7
+        psubd   mm1, mm0
+        pxor    mm5, mm0
+        paddd   mm5, mm1
+        paddusw mm5, mm2                    ; and add to weight_AB
+        psubd   mm5, mm1
+        pxor    mm5, mm0
+        paddd   mm4, mm5                    ; add weights to sum
+        dec     ecx
+        jnz     term_minus_3_loop
+
+        mov     rax, [rsp]                  ; access dpp
+        movq    [rax+8], mm5                ; put weight_AB back
+        movq    [rax+88], mm4               ; put sum_AB back
+        emms
+
+        add     rdi, rsi                    ; back up a full sample
+        mov     edx, [rdi+4]                ; dpp->samples_A [0] = iptr [-1];
+        mov     rax, [rsp]
+        mov     [rax+16], edx
+        mov     edx, [rdi]                  ; dpp->samples_B [0] = iptr [-2];
+        mov     [rax+48], edx
+
+done:   add     rsp, 8                      ; begin epilog by deallocating stack
+        pop     rsi                         ; restore non-volatile registers & return
+        pop     rdi
+        pop     rbx
+        pop     rbp
+        ret
+
+pack_decorr_stereo_pass_cont_common endp
+
+; This is an assembly optimized version of the following WavPack function:
+;
+; uint32_t decorr_mono_buffer (int32_t *buffer,
+;                              struct decorr_pass *decorr_passes,
+;                              int32_t num_terms,
+;                              int32_t sample_count)
+;
+; Decorrelate a buffer of mono samples, in place, as specified by the array
+; of decorr_pass structures. Note that this function does NOT return the
+; dpp->samples_X[] values in the "normalized" positions for terms 1-8, so if
+; the number of samples is not a multiple of MAX_TERM, these must be moved if
+; they are to be used somewhere else. The magnitude of the output samples is
+; accumulated and returned (see scan_max_magnitude() for more details). By
+; using the overflow detection of the multiply instruction, this detects
+; when the "long_math" variant is required.
+;
+; For the fastest possible operation with the four "common" decorrelation
+; filters (i.e, fast, normal, high and very high) this function can be
+; configured to include hardcoded versions of these filters that are created
+; using macros. In that case, the passed filter is checked to make sure that
+; it matches one of the four. If it doesn't, or if the hardcoded filters are
+; not enabled, a "general" version of the decorrelation loop is used. This
+; variable enables the hardcoded filters and can be disabled if there are
+; problems with the code or macros:
+
+        HARDCODED_FILTERS = 1               ; nonzero assembles the chkterm/exeterm macros and the four hardcoded loops; 0 leaves only the general loop
+
+; This is written to work on an X86-64 processor (also called the AMD64)
+; running in 64-bit mode. This version is for the 64-bit Windows ABI and
+; provides appropriate prologs and epilogs for stack unwinding. The
+; arguments are passed in registers:
+;
+;   int32_t *buffer             rcx
+;   struct decorr_pass *dpp     rdx
+;   int32_t num_terms           r8
+;   int32_t sample_count        r9
+;
+; stack usage:
+;
+; [rsp+8] = sample_count
+; [rsp+0] = decorr_passes (unused in hardcoded filter case)
+;
+; register usage:
+;
+; ecx = sample being decorrelated
+; esi = sample up counter
+; rdi = *buffer
+; rbp = *dpp
+; r8 = magnitude accumulator
+; r9 = dpp end ptr (unused in hardcoded filter case)
+;
+        if     HARDCODED_FILTERS
+;
+; This macro is used for checking the decorr_passes array to make sure that the terms match
+; the hardcoded terms. The terms of these filters are the first element in the tables defined
+; in decorr_tables.h (with the negative terms replaced with 1).
+;
+
+chkterm macro   term, rbp_offset
+        cmp     BYTE PTR [rbp], term        ; low byte of this pass's term field vs. the expected term
+        jnz     use_general_version         ; mismatch: not the hardcoded filter, use the general loop
+        add     rbp, rbp_offset             ; step to the next pass (the last check passes a negative offset to rewind)
+        endm
+;
+; This macro processes the single specified term (with a fixed delta of 2) and updates the
+; term pointer (rbp) with the specified offset when done. It assumes the following registers:
+;
+; ecx = sample being decorrelated
+; esi = sample up counter (used for terms 1-8)
+; rbp = decorr_pass pointer for this term (updated with "rbp_offset" when done)
+; rax, rbx, rdx = scratch
+;
+
+exeterm macro   term, rbp_offset
+        local   over, cont, done
+
+        if      term le 8
+        mov     eax, esi                    ; eax = sample up counter
+        and     eax, 7                      ; wrap into the 8-dword history area at [rbp+16]
+        mov     ebx, [rbp+16+rax*4]         ; ebx = history value for this slot (the predictor input)
+        if      term ne 8
+        add     eax, term                   ; slot that will be read "term" samples from now
+        and     eax, 7                      ; (for term 8 this is the slot just read, so the add is skipped)
+        endif
+        mov     [rbp+16+rax*4], ecx         ; save the current sample into that slot
+
+        elseif  term eq 17
+
+        mov     edx, [rbp+16]               ; handle term 17
+        mov     [rbp+16], ecx               ; current sample becomes newest history value
+        lea     ebx, [rdx+rdx]              ; ebx = 2 * previous
+        sub     ebx, [rbp+20]               ; ebx = 2 * previous - the one before it
+        mov     [rbp+20], edx               ; previous becomes second-newest history value
+
+        else
+
+        mov     edx, [rbp+16]               ; handle term 18
+        mov     [rbp+16], ecx               ; current sample becomes newest history value
+        lea     ebx, [rdx+rdx*2]            ; ebx = 3 * previous
+        sub     ebx, [rbp+20]               ; ebx = 3 * previous - the one before it
+        sar     ebx, 1                      ; ebx = (3 * previous - the one before it) >> 1
+        mov     [rbp+20], edx               ; previous becomes second-newest history value
+
+        endif
+
+        mov     eax, [rbp+8]                ; eax = weight of this pass
+        imul    eax, ebx                    ; 32-bit multiply is almost always enough
+        jo      over                        ; but handle overflow if it happens
+        sar     eax, 10                     ; carry = last bit shifted out
+        sbb     ecx, eax                    ; borrow flag provides rounding
+        jmp     cont                        ; ZF from the sbb is tested at cont
+over:   mov     eax, [rbp+8]                ; perform 64-bit multiply on overflow
+        imul    ebx                         ; edx:eax = weight * ebx
+        shr     eax, 10                     ; low half >> 10, carry = rounding bit
+        sbb     ecx, eax                    ; subtract low part plus rounding
+        shl     edx, 22                     ; high half supplies the product bits above the low dword
+        sub     ecx, edx                    ; subtract them too; ZF now reflects the result
+cont:   je      done                        ; result is zero: leave weight unchanged
+        test    ebx, ebx
+        je      done                        ; predictor input is zero: leave weight unchanged
+        xor     ebx, ecx                    ; sign bit = sign (input) ^ sign (result)
+        sar     ebx, 30                     ; negative -> -1 or -2, non-negative -> 0 or 1
+        or      ebx, 1                      ; this generates delta of 1
+        sal     ebx, 1                      ; this generates delta of 2
+        add     [rbp+8], ebx                ; weight += +2 or -2
+done:   add     rbp, rbp_offset             ; advance rbp to the next pass (or rewind after the last)
+
+        endm
+
+        endif                               ; end of macro definitions
+
+; entry points of function
+
+pack_decorr_mono_buffer_x64win proc public frame
+        push_reg    rbp                     ; save non-volatile registers on stack
+        push_reg    rbx                     ; (alphabetically)
+        push_reg    rdi
+        push_reg    rsi
+        alloc_stack 24                      ; allocate 24 bytes on stack & align to 16 bytes
+        end_prologue
+
+        mov     rdi, rcx                    ; copy params from win regs to Linux regs
+        mov     rsi, rdx                    ; so we can leave following code similar
+        mov     rdx, r8                     ; rdx = num_terms
+        mov     rcx, r9                     ; rcx = sample_count
+
+        mov     [rsp+8], rcx                ; [rsp+8] = sample count
+        mov     [rsp], rsi                  ; [rsp+0] = decorr_passes
+        xor     r8, r8                      ; r8 = max magnitude mask
+        xor     esi, esi                    ; up counter = 0
+
+        and     ecx, ecx                    ; test & handle zero sample count & zero term count
+        jz      mexit                       ; no samples: return 0
+        and     edx, edx                    ; 32-bit write also clears the upper half of rdx
+        jz      mexit                       ; no terms: return 0
+
+        if     HARDCODED_FILTERS
+
+; first check to make sure all the "deltas" are 2
+
+        mov     rbp, [rsp]                  ; rbp is decorr_pass pointer
+        mov     ebx, edx                    ; get term count
+deltas: cmp     BYTE PTR [rbp+4], 2         ; make sure all the deltas are 2
+        jnz     use_general_version         ; if any aren't, use general case
+        add     rbp, 96                     ; next decorr_pass (entries are 96 bytes apart)
+        dec     ebx
+        jnz     deltas
+
+        mov     rbp, [rsp]                  ; rbp is decorr_pass pointer
+        cmp     dl, 2                       ; 2 terms is "fast"
+        jnz     nfast
+        chkterm 18,  96                     ; check "fast" terms
+        chkterm 17, -96
+        jmp     mono_fast_loop              ; if both terms match, go execute filter
+
+nfast:  cmp     dl, 5                       ; 5 terms is "normal"
+        jnz     nnorm
+        chkterm 18, 96                      ; check "normal" terms
+        chkterm 18, 96
+        chkterm 2,  96
+        chkterm 17, 96
+        chkterm 3,  96*-4
+        jmp     mono_normal_loop            ; if all terms match, go execute filter
+
+nnorm:  cmp     dl, 10                      ; 10 terms is "high"
+        jnz     nhigh
+        chkterm 18, 96                      ; check "high" terms
+        chkterm 18, 96
+        chkterm 18, 96
+        chkterm 1,  96
+        chkterm 2,  96
+        chkterm 3,  96
+        chkterm 5,  96
+        chkterm 1,  96
+        chkterm 17, 96
+        chkterm 4,  96*-9
+        jmp     mono_high_loop              ; if all terms match, go execute filter
+
+nhigh:  cmp     dl, 16                      ; 16 terms is "very high"
+        jnz     use_general_version         ; if none of these, use general version
+        chkterm 18, 96                      ; else check "very high" terms
+        chkterm 18, 96
+        chkterm 2,  96
+        chkterm 3,  96
+        chkterm 1,  96
+        chkterm 18, 96
+        chkterm 2,  96
+        chkterm 4,  96
+        chkterm 7,  96
+        chkterm 5,  96
+        chkterm 3,  96
+        chkterm 6,  96
+        chkterm 8,  96
+        chkterm 1,  96
+        chkterm 18, 96
+        chkterm 2,  96*-15
+        jmp     mono_vhigh_loop             ; if all terms match, go execute filter
+
+        align   64
+
+; hardcoded "fast" decorrelation loop
+
+mono_fast_loop:
+        mov     ecx, [rdi+rsi*4]             ; ecx is the sample we're decorrelating
+
+        exeterm 18,  96
+        exeterm 17, -96
+
+        mov     [rdi+rsi*4], ecx            ; store completed sample
+        mov     eax, ecx                    ; update magnitude mask
+        cdq                                 ; edx = sign mask of the sample
+        xor     eax, edx                    ; eax = sample ^ sign (complement if negative)
+        or      r8, rax                     ; accumulate into the magnitude mask
+        inc     esi                         ; increment sample index
+        cmp     esi, [rsp+8]                ; reached sample_count?
+        jnz     mono_fast_loop              ; loop back for all samples
+        jmp     mexit                       ; then exit
+
+        align   64
+
+; hardcoded "normal" decorrelation loop
+
+mono_normal_loop:
+        mov     ecx, [rdi+rsi*4]             ; ecx is the sample we're decorrelating
+
+        exeterm 18, 96
+        exeterm 18, 96
+        exeterm 2,  96
+        exeterm 17, 96
+        exeterm 3,  96*-4
+
+        mov     [rdi+rsi*4], ecx            ; store completed sample
+        mov     eax, ecx                    ; update magnitude mask
+        cdq                                 ; edx = sign mask of the sample
+        xor     eax, edx                    ; eax = sample ^ sign (complement if negative)
+        or      r8, rax                     ; accumulate into the magnitude mask
+        inc     esi                         ; increment sample index
+        cmp     esi, [rsp+8]                ; reached sample_count?
+        jnz     mono_normal_loop            ; loop back for all samples
+        jmp     mexit                       ; then exit
+
+        align   64
+
+; hardcoded "high" decorrelation loop
+
+mono_high_loop:
+        mov     ecx, [rdi+rsi*4]             ; ecx is the sample we're decorrelating
+
+        exeterm 18, 96
+        exeterm 18, 96
+        exeterm 18, 96
+        exeterm 1,  96
+        exeterm 2,  96
+        exeterm 3,  96
+        exeterm 5,  96
+        exeterm 1,  96
+        exeterm 17, 96
+        exeterm 4,  96*-9
+
+        mov     [rdi+rsi*4], ecx            ; store completed sample
+        mov     eax, ecx                    ; update magnitude mask
+        cdq                                 ; edx = sign mask of the sample
+        xor     eax, edx                    ; eax = sample ^ sign (complement if negative)
+        or      r8, rax                     ; accumulate into the magnitude mask
+        inc     esi                         ; increment sample index
+        cmp     esi, [rsp+8]                ; reached sample_count?
+        jnz     mono_high_loop              ; loop back for all samples
+        jmp     mexit                       ; then exit
+
+        align   64
+
+; hardcoded "very high" decorrelation loop
+
+mono_vhigh_loop:
+        mov     ecx, [rdi+rsi*4]             ; ecx is the sample we're decorrelating
+
+        exeterm 18, 96
+        exeterm 18, 96
+        exeterm 2,  96
+        exeterm 3,  96
+        exeterm 1,  96
+        exeterm 18, 96
+        exeterm 2,  96
+        exeterm 4,  96
+        exeterm 7,  96
+        exeterm 5,  96
+        exeterm 3,  96
+        exeterm 6,  96
+        exeterm 8,  96
+        exeterm 1,  96
+        exeterm 18, 96
+        exeterm 2,  96*-15
+
+        mov     [rdi+rsi*4], ecx            ; store completed sample
+        mov     eax, ecx                    ; update magnitude mask
+        cdq                                 ; edx = sign mask of the sample
+        xor     eax, edx                    ; eax = sample ^ sign (complement if negative)
+        or      r8, rax                     ; accumulate into the magnitude mask
+        inc     esi                         ; increment sample index
+        cmp     esi, [rsp+8]                ; reached sample_count?
+        jnz     mono_vhigh_loop             ; loop back for all samples
+        jmp     mexit                       ; then exit
+
+        endif                               ; end of hardcoded filters configuration
+
+; if none of the hardcoded filters are applicable, or we aren't using them, fall through to here
+
+use_general_version:
+        mov     rbp, [rsp]                   ; reload decorr_passes pointer to first term
+        imul    rax, rdx, 96                 ; rax = num_terms * 96 (byte length of the pass array)
+        add     rax, rbp                     ; r9 = terminating decorr_pass pointer
+        mov     r9, rax
+        jmp     decorrelate_loop
+
+        align   64
+
+decorrelate_loop:
+        mov     ecx, [rdi+rsi*4]             ; ecx is the sample we're decorrelating
+nxterm: mov     edx, [rbp]                  ; edx = term of this pass
+        cmp     dl, 17                      ; these flags are reused by the je after the next label
+        jge     @f                          ; terms 17 & 18 are handled below
+
+        mov     eax, esi                    ; terms below 17: index the 8-dword history area
+        and     eax, 7
+        mov     ebx, [rbp+16+rax*4]         ; ebx = history value (the predictor input)
+        add     eax, edx                    ; slot that will be read "term" samples from now
+        and     eax, 7
+        mov     [rbp+16+rax*4], ecx         ; save the current sample there
+        jmp     domult
+
+        align   4
+@@:     mov     edx, [rbp+16]               ; edx = previous sample
+        mov     [rbp+16], ecx               ; current sample becomes newest history value
+        je      @f                          ; term == 17 (mov leaves the cmp flags intact)
+        lea     ebx, [rdx+rdx*2]            ; term 18: ebx = 3 * previous
+        sub     ebx, [rbp+20]               ;          - the one before it
+        sar     ebx, 1                      ;          >> 1
+        mov     [rbp+20], edx               ; previous becomes second-newest history value
+        jmp     domult
+
+        align   4
+@@:     lea     ebx, [rdx+rdx]              ; term 17: ebx = 2 * previous
+        sub     ebx, [rbp+20]               ;          - the one before it
+        mov     [rbp+20], edx               ; previous becomes second-newest history value
+
+domult: mov     eax, [rbp+8]                ; eax = weight
+        mov     edx, eax                    ; keep a copy of the weight for the update below
+        imul    eax, ebx
+        jo      multov                      ; on overflow, jump to use 64-bit imul variant
+        sar     eax, 10                     ; carry = last bit shifted out (rounding)
+        sbb     ecx, eax                    ; sample -= weighted prediction (with rounding)
+        je      @f                          ; result zero: no weight update
+        test    ebx, ebx
+        je      @f                          ; predictor input zero: no weight update
+        xor     ebx, ecx
+        sar     ebx, 31                     ; ebx = 0 if signs agree, -1 if they differ
+        xor     edx, ebx                    ; conditional complement ...
+        add     edx, [rbp+4]                ; ... add delta ...
+        xor     edx, ebx                    ; ... complement back: weight +/- delta
+        mov     [rbp+8], edx                ; store updated weight
+@@:     add     rbp, 96                     ; next decorr_pass
+        cmp     rbp, r9
+        jnz     nxterm                      ; more terms for this sample
+
+        mov     [rdi+rsi*4], ecx            ; store completed sample
+        mov     eax, ecx                    ; update magnitude mask
+        cdq                                 ; edx = sign mask of the sample
+        xor     eax, edx                    ; eax = sample ^ sign (complement if negative)
+        or      r8, rax                     ; accumulate into the magnitude mask
+        mov     rbp, [rsp]                  ; reload decorr_passes pointer to first term
+        inc     esi                         ; increment sample index
+        cmp     esi, [rsp+8]                ; reached sample_count?
+        jnz     decorrelate_loop
+        jmp     mexit
+
+        align   4
+multov: mov     eax, [rbp+8]                ; reload weight for the widening multiply
+        imul    ebx                         ; edx:eax = weight * ebx
+        shr     eax, 10                     ; low half >> 10, carry = rounding bit
+        sbb     ecx, eax                    ; subtract low part plus rounding
+        shl     edx, 22                     ; high half supplies the product bits above the low dword
+        sub     ecx, edx                    ; subtract them too; ZF reflects the result
+        je      @f                          ; result zero: no weight update
+        test    ebx, ebx
+        je      @f                          ; predictor input zero: no weight update
+        xor     ebx, ecx
+        sar     ebx, 31                     ; ebx = 0 if signs agree, -1 if they differ
+        mov     eax, [rbp+8]
+        xor     eax, ebx                    ; conditional complement ...
+        add     eax, [rbp+4]                ; ... add delta ...
+        xor     eax, ebx                    ; ... complement back: weight +/- delta
+        mov     [rbp+8], eax                ; store updated weight
+@@:     add     rbp, 96                     ; next decorr_pass
+        cmp     rbp, r9
+        jnz     nxterm                      ; more terms for this sample
+
+        mov     [rdi+rsi*4], ecx            ; store completed sample
+        mov     eax, ecx                    ; update magnitude mask
+        cdq                                 ; edx = sign mask of the sample
+        xor     eax, edx                    ; eax = sample ^ sign (complement if negative)
+        or      r8, rax                     ; accumulate into the magnitude mask
+        mov     rbp, [rsp]                  ; reload decorr_passes pointer to first term
+        inc     esi                         ; increment sample index
+        cmp     esi, [rsp+8]                ; reached sample_count?
+        jnz     decorrelate_loop            ; loop all the way back
+
+; common exit for entire function
+
+mexit:  mov     rax, r8                     ; return max magnitude
+        add     rsp, 24                     ; release the 24 bytes allocated in the prolog
+        pop     rsi                         ; restore non-volatile registers (reverse push order)
+        pop     rdi
+        pop     rbx
+        pop     rbp
+        ret
+
+pack_decorr_mono_buffer_x64win endp
+
+
+; This is an assembly optimized version of the following WavPack function:
+;
+; void decorr_mono_pass_cont (int32_t *out_buffer,
+;                             int32_t *in_buffer,
+;                             struct decorr_pass *dpp,
+;                             int32_t sample_count);
+;
+; It performs a single pass of mono decorrelation, transferring from the
+; input buffer to the output buffer. Note that this version of the function
+; requires that the up to 8 previous (depending on dpp->term) mono samples
+; are visible and correct. In other words, it ignores the "samples_*"
+; fields in the decorr_pass structure and gets the history data directly
+; from the source buffer. It does, however, return the appropriate history
+; samples to the decorr_pass structure before returning.
+;
+; By using the overflow detection of the multiply instruction, it detects
+; when the "long_math" variant is required and automatically does it.
+;
+; This is written to work on an X86-64 processor (also called the AMD64)
+; running in 64-bit mode. This version is for the 64-bit Windows ABI and
+; provides appropriate prologs and epilogs for stack unwinding. The
+; arguments are passed in registers:
+;
+;   int32_t *out_buffer         rcx
+;   int32_t *in_buffer          rdx
+;   struct decorr_pass *dpp     r8
+;   int32_t sample_count        r9
+;
+; Stack usage:
+;
+; [rsp+0] = *dpp
+;
+; Register usage:
+;
+; rsi = source ptr
+; rdi = destination ptr
+; rcx = term * -4 (default terms)
+; rcx = previous sample (terms 17 & 18)
+; ebp = weight
+; r8d = delta
+; r9d = weight sum
+; r10 = eptr
+;
+
+pack_decorr_mono_pass_cont_x64win proc public frame
+        push_reg    rbp                     ; save non-volatile registers on stack
+        push_reg    rbx
+        push_reg    rdi
+        push_reg    rsi
+        alloc_stack 8                       ; allocate 8 bytes on stack & align to 16 bytes
+        end_prologue
+
+        mov     rdi, rcx                    ; copy params from win regs to Linux regs
+        mov     rsi, rdx                    ; so we can leave following code similar
+        mov     rdx, r8                     ; rdx = dpp
+        mov     rcx, r9                     ; rcx = sample_count
+
+        mov     [rsp], rdx                  ; [rsp+0] = dpp
+        and     ecx, ecx                    ; test & handle zero sample count
+        jz      mono_done
+
+        cld                                 ; lodsd/stosd below must advance forward
+        mov     r8d, [rdx+4]                ; r8d = delta
+        mov     ebp, [rdx+8]                ; ebp = weight
+        mov     r9d, [rdx+88]               ; r9d = weight sum
+        lea     r10, [rsi+rcx*4]            ; r10 = eptr
+        mov     ecx, [rsi-4]                ; preload last sample
+        mov     eax, [rdx]                  ; get term
+        cmp     al, 17
+        je      mono_term_17_loop
+        cmp     al, 18
+        je      mono_term_18_loop
+
+        imul    rcx, rax, -4                ; rcx is index to correlation sample
+        jmp     mono_default_term_loop
+
+        align  64
+
+mono_default_term_loop:
+        mov     edx, [rsi+rcx]              ; edx = source sample "term" positions back
+        mov     ebx, edx                    ; ebx = sam_A
+        imul    edx, ebp                    ; 32-bit sam_A * weight
+        jo      over                        ; overflow: redo with the widening multiply
+        lodsd                               ; eax = next source sample, rsi += 4
+        sar     edx, 10                     ; carry = last bit shifted out (rounding)
+        sbb     eax, edx                    ; eax = sample - apply_weight (sam_A)
+        jmp     @f
+over:   mov     eax, ebx
+        imul    ebp                         ; edx:eax = sam_A * weight
+        shl     edx, 22
+        shr     eax, 10                     ; carry = rounding bit
+        adc     edx, eax                    ; edx = apply_weight (sam_A)
+        lodsd                               ; eax = next source sample, rsi += 4
+        sub     eax, edx
+@@:     stosd                               ; store result, rdi += 4 (flags unchanged)
+        je      @f                          ; result zero: no weight update
+        test    ebx, ebx
+        je      @f                          ; sam_A zero: no weight update
+        xor     eax, ebx
+        cdq                                 ; edx = 0 if signs agree, -1 if they differ
+        xor     ebp, edx                    ; conditional complement ...
+        add     ebp, r8d                    ; ... add delta ...
+        xor     ebp, edx                    ; ... complement back: weight +/- delta
+@@:     add     r9d, ebp                    ; accumulate weight into weight sum
+        cmp     rsi, r10
+        jnz     mono_default_term_loop
+
+        mov     rdx, [rsp]                  ; rdx = *dpp
+        mov     [rdx+8], ebp                ; put weight back
+        mov     [rdx+88], r9d               ; put weight sum back
+        movsxd  rcx, DWORD PTR [rdx]        ; rcx = dpp->term
+
+mono_default_store_samples:
+        dec     rcx
+        sub     rsi, 4                      ; back up one sample
+        mov     eax, [rsi]
+        mov     [rdx+rcx*4+16], eax         ; store samples_A [ecx]
+        test    rcx, rcx
+        jnz     mono_default_store_samples  ; repeat until index 0 has been stored
+        jmp     mono_done
+
+        align  64
+
+mono_term_17_loop:
+        lea     edx, [rcx+rcx]              ; edx = 2 * previous sample
+        sub     edx, [rsi-8]                ; edx = 2 * previous - the one before it
+        mov     ebx, edx                    ; ebx = sam_A
+        imul    edx, ebp                    ; 32-bit sam_A * weight
+        jo      over17                      ; overflow: redo with the widening multiply
+        sar     edx, 10                     ; carry = last bit shifted out (rounding)
+        lodsd                               ; eax = next source sample, rsi += 4
+        mov     ecx, eax                    ; remember it as the new previous sample
+        sbb     eax, edx                    ; eax = sample - apply_weight (sam_A)
+        jmp     @f
+over17: mov     eax, ebx
+        imul    ebp                         ; edx:eax = sam_A * weight
+        shl     edx, 22
+        shr     eax, 10                     ; carry = rounding bit
+        adc     edx, eax                    ; edx = apply_weight (sam_A)
+        lodsd                               ; eax = next source sample, rsi += 4
+        mov     ecx, eax                    ; remember it as the new previous sample
+        sub     eax, edx
+@@:     stosd                               ; store result, rdi += 4 (flags unchanged)
+        je      @f                          ; result zero: no weight update
+        test    ebx, ebx
+        je      @f                          ; sam_A zero: no weight update
+        xor     eax, ebx
+        cdq                                 ; edx = 0 if signs agree, -1 if they differ
+        xor     ebp, edx                    ; conditional complement ...
+        add     ebp, r8d                    ; ... add delta ...
+        xor     ebp, edx                    ; ... complement back: weight +/- delta
+@@:     add     r9d, ebp                    ; accumulate weight into weight sum
+        cmp     rsi, r10
+        jnz     mono_term_17_loop
+        jmp     mono_term_1718_exit
+
+        align  64
+
+mono_term_18_loop:
+        lea     edx, [rcx+rcx*2]            ; edx = 3 * previous sample
+        sub     edx, [rsi-8]                ; edx = 3 * previous - the one before it
+        sar     edx, 1
+        mov     ebx, edx                    ; ebx = sam_A
+        imul    edx, ebp                    ; 32-bit sam_A * weight
+        jo      over18                      ; overflow: redo with the widening multiply
+        sar     edx, 10                     ; carry = last bit shifted out (rounding)
+        lodsd                               ; eax = next source sample, rsi += 4
+        mov     ecx, eax                    ; remember it as the new previous sample
+        sbb     eax, edx                    ; eax = sample - apply_weight (sam_A)
+        jmp     @f
+over18: mov     eax, ebx
+        imul    ebp                         ; edx:eax = sam_A * weight
+        shl     edx, 22
+        shr     eax, 10                     ; carry = rounding bit
+        adc     edx, eax                    ; edx = apply_weight (sam_A)
+        lodsd                               ; eax = next source sample, rsi += 4
+        mov     ecx, eax                    ; remember it as the new previous sample
+        sub     eax, edx
+@@:     stosd                               ; store result, rdi += 4 (flags unchanged)
+        je      @f                          ; result zero: no weight update
+        test    ebx, ebx
+        je      @f                          ; sam_A zero: no weight update
+        xor     eax, ebx
+        cdq                                 ; edx = 0 if signs agree, -1 if they differ
+        xor     ebp, edx                    ; conditional complement ...
+        add     ebp, r8d                    ; ... add delta ...
+        xor     ebp, edx                    ; ... complement back: weight +/- delta
+@@:     add     r9d, ebp                    ; accumulate weight into weight sum
+        cmp     rsi, r10
+        jnz     mono_term_18_loop
+
+mono_term_1718_exit:
+        mov     rdx, [rsp]                  ; rdx = *dpp
+        mov     [rdx+8], ebp                ; put weight back
+        mov     [rdx+88], r9d               ; put weight sum back
+        mov     eax, [rsi-4]                ; dpp->samples_A [0] = bptr [-1]
+        mov     [rdx+16], eax
+        mov     eax, [rsi-8]                ; dpp->samples_A [1] = bptr [-2]
+        mov     [rdx+20], eax
+
+mono_done:
+        add     rsp, 8                      ; release the 8 bytes allocated in the prolog
+        pop     rsi                         ; restore non-volatile registers (reverse push order)
+        pop     rdi
+        pop     rbx
+        pop     rbp
+        ret
+
+pack_decorr_mono_pass_cont_x64win endp
+
+
+; This is an assembly optimized version of the following WavPack function:
+;
+; uint32_t scan_max_magnitude (int32_t *buffer, int32_t sample_count);
+;
+; This function scans a buffer of signed 32-bit ints and returns the magnitude
+; of the largest sample, with a power-of-two resolution. It might be more
+; useful to return the actual maximum absolute value, but that implementation
+; would be slower. Instead, this simply returns the "or" of all the values
+; "xor"d with their own sign, like so:
+;
+;     while (sample_count--)
+;         magnitude |= (*buffer < 0) ? ~*buffer++ : *buffer++;
+;
+; This is written to work on an X86-64 processor (also called the AMD64)
+; running in 64-bit mode and uses the MMX extensions to improve the
+; performance by processing two samples together.
+;
+; As with the other routines in this file, this version is for the
+; 64-bit Windows ABI and provides appropriate prologs and epilogs for
+; stack unwinding.
+; The arguments are passed in registers:
+;
+;   int32_t *buffer             rcx
+;   int32_t sample_count        rdx
+;
+; During the processing loops, the following registers are used:
+;
+;   rdi         buffer pointer
+;   rsi         termination buffer pointer
+;   ebx         single magnitude accumulator
+;   mm0         dual magnitude accumulator
+;   mm1, mm2    scratch
+;
+
+scan_max_magnitude_x64win proc public frame
+        push_reg    rbp                     ; save non-volatile registers on stack
+        push_reg    rbx                     ; (alphabetically)
+        push_reg    rdi
+        push_reg    rsi
+        alloc_stack 8                       ; allocate 8 bytes on stack & align to 16 bytes
+        end_prologue
+
+        mov     rdi, rcx                    ; copy params from win regs to Linux regs
+        mov     rsi, rdx                    ; so we can leave following code similar
+
+        xor     ebx, ebx                    ; clear magnitude accumulator
+
+        mov     eax, esi                    ; eax = count
+        and     eax, 7                      ; count mod 8
+        mov     ecx, eax                    ; ecx = leftover samples to "manually" scan at end
+
+        shr     esi, 3                      ; esi = num of loops to process mmx (8 samples/loop)
+        shl     esi, 5                      ; esi = num of bytes to process mmx (32 bytes/loop)
+        jz      nommx                       ; jump around if no mmx loops to do (< 8 samples)
+
+        pxor    mm0, mm0                    ; clear dual magnitude accumulator
+        add     rsi, rdi                    ; rsi = termination buffer pointer for mmx loop
+        jmp     mmxlp
+
+        align   64
+
+mmxlp:  movq    mm1, [rdi]                  ; get stereo samples in mm1 & mm2
+        movq    mm2, mm1
+        psrad   mm1, 31                     ; mm1 = sign (mm2)
+        pxor    mm1, mm2                    ; mm1 = absolute magnitude, or into result
+        por     mm0, mm1
+
+        movq    mm1, [rdi+8]                ; do it again with 6 more samples
+        movq    mm2, mm1
+        psrad   mm1, 31
+        pxor    mm1, mm2
+        por     mm0, mm1
+
+        movq    mm1, [rdi+16]
+        movq    mm2, mm1
+        psrad   mm1, 31
+        pxor    mm1, mm2
+        por     mm0, mm1
+
+        movq    mm1, [rdi+24]
+        movq    mm2, mm1
+        psrad   mm1, 31
+        pxor    mm1, mm2
+        por     mm0, mm1
+
+        add     rdi, 32                     ; advance past the 8 samples just scanned
+        cmp     rdi, rsi
+        jnz     mmxlp                       ; until the termination pointer is reached
+
+        movd    eax, mm0                    ; ebx = "or" of high and low mm0
+        punpckhdq mm0, mm0                  ; bring the high dword of mm0 down to the low dword
+        movd    ebx, mm0
+        or      ebx, eax
+        emms                                ; done with MMX registers
+
+nommx:  and     ecx, ecx                    ; any leftover samples to do?
+        jz      noleft
+
+leftlp: mov     eax, [rdi]                  ; scalar path: one sample per iteration
+        cdq                                 ; edx = sign mask of the sample
+        xor     eax, edx                    ; eax = sample ^ sign (complement if negative)
+        or      ebx, eax                    ; accumulate into the magnitude
+        add     rdi, 4
+        loop    leftlp                      ; decrement the leftover count and repeat while non-zero
+
+noleft: mov     eax, ebx                    ; move magnitude to eax for return
+        add     rsp, 8                      ; release the 8 bytes allocated in the prolog
+        pop     rsi                         ; restore non-volatile registers (reverse push order)
+        pop     rdi
+        pop     rbx
+        pop     rbp
+        ret
+
+scan_max_magnitude_x64win endp
+
+
+; This is an assembly optimized version of the following WavPack function:
+;
+; uint32_t log2buffer (int32_t *samples, uint32_t num_samples, int limit);
+;
+; This function scans a buffer of 32-bit ints and accumulates the total
+; log2 value of all the samples. This is useful for determining maximum
+; compression because the bitstream storage required for entropy coding
+; is proportional to the base 2 log of the samples.
+;
+; This is written to work on an X86-64 processor (also called the AMD64)
+; running in 64-bit mode. This version is for the 64-bit Windows ABI and
+; provides appropriate prologs and epilogs for stack unwinding. The
+; arguments are passed in registers:
+;
+;   int32_t *samples            rcx
+;   uint32_t num_samples        rdx
+;   int limit                   r8
+;
+; During the processing loops, the following registers are used:
+;
+;   r8              pointer to the 256-byte log fraction table
+;   rsi             input buffer pointer
+;   edi             sum accumulator
+;   ebx             sample count
+;   ebp             limit (if specified non-zero)
+;   eax,ecx,edx     scratch
+;
+
+        align  256
+
+        .radix 16
+
+log2_table:                                 ; 256 one-byte log fractions (hex), indexed by the 8-bit mantissa; values appear to be about 256 * log2 (1 + i/256) - TODO confirm
+        byte   000, 001, 003, 004, 006, 007, 009, 00a, 00b, 00d, 00e, 010, 011, 012, 014, 015
+        byte   016, 018, 019, 01a, 01c, 01d, 01e, 020, 021, 022, 024, 025, 026, 028, 029, 02a
+        byte   02c, 02d, 02e, 02f, 031, 032, 033, 034, 036, 037, 038, 039, 03b, 03c, 03d, 03e
+        byte   03f, 041, 042, 043, 044, 045, 047, 048, 049, 04a, 04b, 04d, 04e, 04f, 050, 051
+        byte   052, 054, 055, 056, 057, 058, 059, 05a, 05c, 05d, 05e, 05f, 060, 061, 062, 063
+        byte   064, 066, 067, 068, 069, 06a, 06b, 06c, 06d, 06e, 06f, 070, 071, 072, 074, 075
+        byte   076, 077, 078, 079, 07a, 07b, 07c, 07d, 07e, 07f, 080, 081, 082, 083, 084, 085
+        byte   086, 087, 088, 089, 08a, 08b, 08c, 08d, 08e, 08f, 090, 091, 092, 093, 094, 095
+        byte   096, 097, 098, 099, 09a, 09b, 09b, 09c, 09d, 09e, 09f, 0a0, 0a1, 0a2, 0a3, 0a4
+        byte   0a5, 0a6, 0a7, 0a8, 0a9, 0a9, 0aa, 0ab, 0ac, 0ad, 0ae, 0af, 0b0, 0b1, 0b2, 0b2
+        byte   0b3, 0b4, 0b5, 0b6, 0b7, 0b8, 0b9, 0b9, 0ba, 0bb, 0bc, 0bd, 0be, 0bf, 0c0, 0c0
+        byte   0c1, 0c2, 0c3, 0c4, 0c5, 0c6, 0c6, 0c7, 0c8, 0c9, 0ca, 0cb, 0cb, 0cc, 0cd, 0ce
+        byte   0cf, 0d0, 0d0, 0d1, 0d2, 0d3, 0d4, 0d4, 0d5, 0d6, 0d7, 0d8, 0d8, 0d9, 0da, 0db
+        byte   0dc, 0dc, 0dd, 0de, 0df, 0e0, 0e0, 0e1, 0e2, 0e3, 0e4, 0e4, 0e5, 0e6, 0e7, 0e7
+        byte   0e8, 0e9, 0ea, 0ea, 0eb, 0ec, 0ed, 0ee, 0ee, 0ef, 0f0, 0f1, 0f1, 0f2, 0f3, 0f4
+        byte   0f4, 0f5, 0f6, 0f7, 0f7, 0f8, 0f9, 0f9, 0fa, 0fb, 0fc, 0fc, 0fd, 0fe, 0ff, 0ff
+
+        .radix  10
+
+log2buffer_x64win proc public frame
+        push_reg    rbp                     ; save non-volatile registers on stack
+        push_reg    rbx                     ; (alphabetically)
+        push_reg    rdi
+        push_reg    rsi
+        alloc_stack 8                       ; allocate 8 bytes on stack & align to 16 bytes
+        end_prologue
+
+        mov     rdi, rcx                    ; copy params from win regs to Linux regs
+        mov     rsi, rdx                    ; so we can leave following code similar
+        mov     rdx, r8                     ; rdx = limit
+
+        mov     ebx, esi                    ; ebx = num_samples
+        mov     rsi, rdi                    ; rsi = *samples
+        xor     edi, edi                    ; initialize sum
+        lea     r8, log2_table              ; r8 = base of the log fraction table
+        test    ebx, ebx                    ; test count for zero
+        jz      normal_exit                 ; no samples: return a sum of 0
+        mov     ebp, edx                    ; ebp = limit
+        test    ebp, ebp                    ; we have separate loops for limit and no limit
+        jz      no_limit_loop
+        jmp     limit_loop
+
+        align  64
+
+limit_loop:
+        mov     eax, [rsi]                  ; get next sample into eax
+        cdq                                 ; edx = sign of sample (for abs)
+        add     rsi, 4
+        xor     eax, edx                    ; eax = abs (sample): complement if negative ...
+        sub     eax, edx                    ; ... then add 1 if negative; ZF set if zero
+        je      L40                         ; skip if sample was zero
+        mov     edx, eax                    ; move to edx and apply rounding
+        shr     eax, 9
+        add     edx, eax                    ; edx = abs + (abs >> 9)
+        bsr     ecx, edx                    ; ecx = MSB set in sample (0 - 31)
+        lea     eax, [ecx+1]                ; eax = number used bits in sample (1 - 32)
+        sub     ecx, 8                      ; ecx = shift right amount (-8 to 23)
+        ror     edx, cl                     ; use rotate to do "signed" shift (a negative count rotates left)
+        sal     eax, 8                      ; move nbits to integer portion of log
+        movzx   edx, dl                     ; dl = mantissa, look up log fraction in table
+        mov     al, [r8+rdx]                ; eax = combined integer and fraction for full log
+        add     edi, eax                    ; add to running sum, then compare this sample's log to limit
+        cmp     eax, ebp
+        jge     limit_exceeded              ; a single sample's log reached the limit
+L40:    sub     ebx, 1                      ; loop back if more samples
+        jne     limit_loop
+        jmp     normal_exit
+
+        align  64
+
+no_limit_loop:
+        mov     eax, [rsi]                  ; get next sample into eax
+        cdq                                 ; edx = sign of sample (for abs)
+        add     rsi, 4
+        xor     eax, edx                    ; eax = abs (sample): complement if negative ...
+        sub     eax, edx                    ; ... then add 1 if negative; ZF set if zero
+        je      L45                         ; skip if sample was zero
+        mov     edx, eax                    ; move to edx and apply rounding
+        shr     eax, 9
+        add     edx, eax                    ; edx = abs + (abs >> 9)
+        bsr     ecx, edx                    ; ecx = MSB set in sample (0 - 31)
+        lea     eax, [ecx+1]                ; eax = number used bits in sample (1 - 32)
+        sub     ecx, 8                      ; ecx = shift right amount (-8 to 23)
+        ror     edx, cl                     ; use rotate to do "signed" shift (a negative count rotates left)
+        sal     eax, 8                      ; move nbits to integer portion of log
+        movzx   edx, dl                     ; dl = mantissa, look up log fraction in table
+        mov     al, [r8+rdx]                ; eax = combined integer and fraction for full log
+        add     edi, eax                    ; add to running sum
+L45:    sub     ebx, 1                      ; loop back if more samples
+        jne     no_limit_loop
+        jmp     normal_exit
+
+limit_exceeded:
+        mov     edi, -1                     ; return -1 to indicate limit hit
+normal_exit:
+        mov     eax, edi                    ; move sum accumulator into eax for return
+
+        add     rsp, 8                      ; begin epilog by deallocating stack
+        pop     rsi                         ; restore non-volatile registers & return
+        pop     rdi
+        pop     rbx
+        pop     rbp
+        ret
+
+log2buffer_x64win endp
+
+asmcode ends
+
+        end
+
diff --git a/third_party/wavpack/src/pack_x86.S b/third_party/wavpack/src/pack_x86.S
new file mode 100644
index 0000000..31cf7a4
--- /dev/null
+++ b/third_party/wavpack/src/pack_x86.S
@@ -0,0 +1,1840 @@
+############################################################################
+##                           **** WAVPACK ****                            ##
+##                  Hybrid Lossless Wavefile Compressor                   ##
+##              Copyright (c) 1998 - 2015 Conifer Software.               ##
+##                          All Rights Reserved.                          ##
+##      Distributed under the BSD Software License (see license.txt)      ##
+############################################################################
+
+# Assemble the rest of this file with Intel operand order (destination first)
+# and bare register names, and place the code in the text section.
+        .intel_syntax noprefix
+        .text
+
+# Each routine is exported under two names, with and without a leading
+# underscore; the routines defined below carry both labels on the same entry
+# point. Presumably this lets the same source link on toolchains that do and
+# do not prepend an underscore to C symbol names (assumption - confirm
+# against the build configurations that use this file).
+
+        .globl  _pack_decorr_stereo_pass_x86
+        .globl  _pack_decorr_stereo_pass_cont_rev_x86
+        .globl  _pack_decorr_stereo_pass_cont_x86
+        .globl  _pack_decorr_mono_buffer_x86
+        .globl  _pack_decorr_mono_pass_cont_x86
+        .globl  _pack_cpu_has_feature_x86
+        .globl  _scan_max_magnitude_x86
+        .globl  _log2buffer_x86
+
+        .globl  pack_decorr_stereo_pass_x86
+        .globl  pack_decorr_stereo_pass_cont_rev_x86
+        .globl  pack_decorr_stereo_pass_cont_x86
+        .globl  pack_decorr_mono_buffer_x86
+        .globl  pack_decorr_mono_pass_cont_x86
+        .globl  pack_cpu_has_feature_x86
+        .globl  scan_max_magnitude_x86
+        .globl  log2buffer_x86
+
+# This module contains X86 assembly optimized versions of functions required
+# to encode WavPack files. Note that the stereo versions of these functions
+# use the MMX registers and instructions of the X86 processor, and so a
+# helper function is provided to make a runtime check for that feature.
+
+# This is an assembly optimized version of the following WavPack function:
+#
+# void pack_decorr_stereo_pass (
+#   struct decorr_pass *dpp,
+#   int32_t *buffer,
+#   int32_t sample_count);
+#
+# It performs a single pass of stereo decorrelation, in place, as specified
+# by the decorr_pass structure. Note that this function does NOT return the
+# dpp->samples_X[] values in the "normalized" positions for terms 1-8, so if
+# the number of samples is not a multiple of MAX_TERM, these must be moved if
+# they are to be used somewhere else.
+#
+# This is written to work on an IA-32 processor and uses the MMX extensions
+# to improve the performance by processing both stereo channels together.
+# It is based on the original MMX code written by Joachim Henke that used
+# MMX intrinsics called from C. Many thanks to Joachim for that!
+#
+# An issue with using MMX for this is that the sample history array in the
+# decorr_pass structure contains separate arrays for each channel while the
+# MMX code wants there to be a single array of dual samples. The fix for
+# this is to convert the data in the arrays on entry and exit, and this is
+# made easy by the fact that the 8 MMX registers hold exactly the required
+# amount of data (64 bytes)!
+#
+# This is written to work on an IA-32 processor. The arguments are on the
+# stack at these locations (after 4 pushes, we do not use ebp as a base
+# pointer):
+#
+#   struct decorr_pass *dpp   [esp+20]
+#   int32_t *buffer           [esp+24]
+#   int32_t sample_count      [esp+28]
+#
+# During the processing loops, the following registers are used:
+#
+#   edi         buffer pointer
+#   esi         termination buffer pointer
+#   eax,ebx,edx used in default term to reduce calculation         
+#   ebp         decorr_pass pointer
+#   mm0, mm1    scratch
+#   mm2         original sample values
+#   mm3         correlation samples
+#   mm4         0 (for pcmpeqd)
+#   mm5         weights
+#   mm6         delta
+#   mm7         512 (for rounding)
+#
+
+_pack_decorr_stereo_pass_x86:
+pack_decorr_stereo_pass_x86:
+        // In-place stereo decorrelation pass over "buffer" using the state in
+        // the decorr_pass structure. Offsets used from ebp (dpp) below:
+        //   +0 term, +4 delta, +8 weight_A/weight_B (one qword),
+        //   +16..+47 samples_A[8], +48..+79 samples_B[8]
+        // A call with sample_count == 0 returns without touching *dpp.
+
+        push    ebp
+        push    ebx
+        push    edi
+        push    esi
+
+        // four pushes plus the return address put the first argument at esp+20
+        mov     ebp, [esp+20]               # ebp = *dpp
+        mov     edi, [esp+24]               # edi = buffer
+        mov     esi, [esp+28]
+        shl     esi, 3                      # esi = sample_count * 8 (bytes of stereo data)
+
+        // Nothing to process: leave immediately. This must NOT go through
+        // bdone, because bdone stores mm5 as the weights and de-interleaves
+        // the history arrays, and neither has been set up at this point.
+        jz      buff_pass_exit
+        add     esi, edi                    # esi = termination buffer pointer
+
+        // convert samples_A and samples_B array into samples_AB array for MMX
+        // (the MMX registers provide exactly enough storage to do this easily)
+
+        movq        mm0, [ebp+16]
+        punpckldq   mm0, [ebp+48]
+        movq        mm1, [ebp+16]
+        punpckhdq   mm1, [ebp+48]
+        movq        mm2, [ebp+24]
+        punpckldq   mm2, [ebp+56]
+        movq        mm3, [ebp+24]
+        punpckhdq   mm3, [ebp+56]
+        movq        mm4, [ebp+32]
+        punpckldq   mm4, [ebp+64]
+        movq        mm5, [ebp+32]
+        punpckhdq   mm5, [ebp+64]
+        movq        mm6, [ebp+40]
+        punpckldq   mm6, [ebp+72]
+        movq        mm7, [ebp+40]
+        punpckhdq   mm7, [ebp+72]
+
+        movq    [ebp+16], mm0
+        movq    [ebp+24], mm1
+        movq    [ebp+32], mm2
+        movq    [ebp+40], mm3
+        movq    [ebp+48], mm4
+        movq    [ebp+56], mm5
+        movq    [ebp+64], mm6
+        movq    [ebp+72], mm7
+
+        mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  # mm7 = round (512)
+
+        mov     eax, [ebp+4]
+        movd    mm6, eax
+        punpckldq mm6, mm6                  # mm6 = delta (0-7)
+
+        mov     eax, 0xFFFF                 # mask high weights to zero for PMADDWD
+        movd    mm5, eax
+        punpckldq mm5, mm5                  # mm5 = weight mask 0x0000FFFF0000FFFF
+        pand    mm5, [ebp+8]                # mm5 = weight_AB masked to 16-bit
+
+        movq    mm4, [ebp+16]               # preload samples_AB[0]
+
+        // only the low byte of the term is needed to tell the cases apart
+        mov     al, [ebp]                   # get term and vector to correct loop
+        cmp     al, 17
+        je      buff_term_17_loop
+        cmp     al, 18
+        je      buff_term_18_loop
+        cmp     al, -1
+        je      buff_term_minus_1_loop
+        cmp     al, -2
+        je      buff_term_minus_2_loop
+        cmp     al, -3
+        je      buff_term_minus_3_loop
+
+        // default term: the 8-entry samples_AB array is used as a ring buffer
+        //   eax = read index (starts at 0)
+        //   ebx = write index (starts at term & 7)
+        //   ecx = 7, the wrap mask for both indexes
+        pxor    mm4, mm4                    # mm4 = 0 (for pcmpeqd)
+        xor     eax, eax
+        xor     ebx, ebx
+        add     bl, [ebp]
+        mov     ecx, 7
+        and     ebx, ecx
+        jmp     buff_default_term_loop
+
+        .balign  64
+
+        // All loops below share the same weight application: the correlation
+        // sample in mm3 is split so that each half can go through the signed
+        // 16-bit multiply of PMADDWD against the 16-bit weight in mm5.
+
+buff_default_term_loop:
+        movq    mm2, [edi]                  # mm2 = left_right
+        movq    mm3, [ebp+16+eax*8]
+        inc     eax
+        and     eax, ecx
+        movq    [ebp+16+ebx*8], mm2
+        inc     ebx
+        and     ebx, ecx
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1                      # low word = sam_AB & 0x7fff
+        pmaddwd mm1, mm5                    # mm1 = (low 15 bits of sam_AB) * weight
+
+        movq    mm0, mm3
+        psrld   mm0, 15                     # low word = bits 15-30 of sam_AB
+        pmaddwd mm0, mm5                    # mm0 = (upper part of sam_AB) * weight
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # add shifted sums
+        movq    mm0, mm3
+        movq    [edi], mm2                  # store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        add     edi, 8
+        pcmpeqd mm2, mm4                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm4                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pxor    mm5, mm0                    # xor / add / xor = add or subtract delta by sign
+        paddw   mm5, mm2                    # and add to weight_AB
+        pxor    mm5, mm0
+        cmp     edi, esi
+        jnz     buff_default_term_loop
+
+        jmp     bdone
+
+        .balign  64
+
+        // term 17: sam_AB = 2 * previous sample - the one before it
+
+buff_term_17_loop:
+        movq    mm3, mm4                    # get previous calculated value
+        paddd   mm3, mm4
+        psubd   mm3, [ebp+24]
+        movq    [ebp+24], mm4
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        psrld   mm0, 15
+        pmaddwd mm0, mm5
+
+        movq    mm2, [edi]                  # mm2 = left_right
+        movq    mm4, mm2
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # add shifted sums
+        movq    mm0, mm3
+        movq    [edi], mm2                  # store result
+        pxor    mm1, mm1
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        add     edi, 8
+        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pxor    mm5, mm0
+        paddw   mm5, mm2                    # and add to weight_AB
+        pxor    mm5, mm0
+        cmp     edi, esi
+        jnz     buff_term_17_loop
+
+        movq    [ebp+16], mm4               # post-store samples_AB[0]
+        jmp     bdone
+
+        .balign  64
+
+        // term 18: sam_AB = previous + ((previous - the one before it) >> 1)
+
+buff_term_18_loop:
+        movq    mm3, mm4                    # get previous calculated value
+        psubd   mm3, [ebp+24]
+        psrad   mm3, 1
+        paddd   mm3, mm4                    # mm3 = sam_AB
+        movq    [ebp+24], mm4
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        psrld   mm0, 15
+        pmaddwd mm0, mm5
+
+        movq    mm2, [edi]                  # mm2 = left_right
+        movq    mm4, mm2
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # add shifted sums
+        movq    mm0, mm3
+        movq    [edi], mm2                  # store result
+        pxor    mm1, mm1
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        add     edi, 8
+        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pxor    mm5, mm0
+        paddw   mm5, mm2                    # and add to weight_AB
+        pxor    mm5, mm0
+        cmp     edi, esi
+        jnz     buff_term_18_loop
+
+        movq    [ebp+16], mm4               # post-store samples_AB[0]
+        jmp     bdone
+
+        .balign  64
+
+        // Negative terms: the weight update below differs from the loops
+        // above. mm1 is built as -1 - 512 - 512 - sign, and the unsigned
+        // saturating add (paddusw) bracketed by the paddw / psubw of that
+        // bias keeps the weight magnitude from growing past 1024.
+
+buff_term_minus_1_loop:
+        movq    mm3, mm4                    # mm3 = previous calculated value
+        movq    mm2, [edi]                  # mm2 = left_right
+        movq    mm4, mm2
+        psrlq   mm4, 32
+        punpckldq mm3, mm2                  # mm3 = sam_AB
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        psrld   mm0, 15
+        pmaddwd mm0, mm5
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # add shifted sums
+        movq    mm0, mm3
+        movq    [edi], mm2                  # store result
+        pxor    mm1, mm1
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        add     edi, 8
+        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pcmpeqd mm1, mm1                    # mm1 = -1 in each dword
+        psubd   mm1, mm7
+        psubd   mm1, mm7
+        psubd   mm1, mm0                    # mm1 = saturation bias
+        pxor    mm5, mm0
+        paddw   mm5, mm1
+        paddusw mm5, mm2                    # and add to weight_AB
+        psubw   mm5, mm1
+        pxor    mm5, mm0
+        cmp     edi, esi
+        jnz     buff_term_minus_1_loop
+
+        movq    [ebp+16], mm4               # post-store samples_AB[0]
+        jmp     bdone
+
+        .balign  64
+
+buff_term_minus_2_loop:
+        movq    mm2, [edi]                  # mm2 = left_right
+        movq    mm3, mm2
+        psrlq   mm3, 32
+        por     mm3, mm4
+        punpckldq mm4, mm2
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        psrld   mm0, 15
+        pmaddwd mm0, mm5
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # add shifted sums
+        movq    mm0, mm3
+        movq    [edi], mm2                  # store result
+        pxor    mm1, mm1
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        add     edi, 8
+        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pcmpeqd mm1, mm1                    # mm1 = -1 in each dword
+        psubd   mm1, mm7
+        psubd   mm1, mm7
+        psubd   mm1, mm0                    # mm1 = saturation bias
+        pxor    mm5, mm0
+        paddw   mm5, mm1
+        paddusw mm5, mm2                    # and add to weight_AB
+        psubw   mm5, mm1
+        pxor    mm5, mm0
+        cmp     edi, esi
+        jnz     buff_term_minus_2_loop
+
+        movq    [ebp+16], mm4               # post-store samples_AB[0]
+        jmp     bdone
+
+        .balign  64
+
+buff_term_minus_3_loop:
+        movq    mm2, [edi]                  # mm2 = left_right
+        movq    mm3, mm4                    # mm3 = previous calculated value
+        movq    mm4, mm2                    # mm4 = swap dwords of new data
+        psrlq   mm4, 32
+        punpckldq mm4, mm2                  # mm4 = swapped pair kept for next iteration
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        psrld   mm0, 15
+        pmaddwd mm0, mm5
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # add shifted sums
+        movq    mm0, mm3
+        movq    [edi], mm2                  # store result
+        pxor    mm1, mm1
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        add     edi, 8
+        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pcmpeqd mm1, mm1                    # mm1 = -1 in each dword
+        psubd   mm1, mm7
+        psubd   mm1, mm7
+        psubd   mm1, mm0                    # mm1 = saturation bias
+        pxor    mm5, mm0
+        paddw   mm5, mm1
+        paddusw mm5, mm2                    # and add to weight_AB
+        psubw   mm5, mm1
+        pxor    mm5, mm0
+        cmp     edi, esi
+        jnz     buff_term_minus_3_loop
+
+        movq    [ebp+16], mm4               # post-store samples_AB[0]
+
+        // Common exit for every loop. Only reached after the setup above has
+        // run, so mm5 holds valid weights and the history is interleaved.
+
+bdone:  pslld   mm5, 16                     # sign-extend 16-bit weights back to dwords
+        psrad   mm5, 16
+        movq    [ebp+8], mm5                # put weight_AB back
+
+        // convert samples_AB array back into samples_A and samples_B
+
+        movq    mm0, [ebp+16]
+        movq    mm1, [ebp+24]
+        movq    mm2, [ebp+32]
+        movq    mm3, [ebp+40]
+        movq    mm4, [ebp+48]
+        movq    mm5, [ebp+56]
+        movq    mm6, [ebp+64]
+        movq    mm7, [ebp+72]
+
+        movd    [ebp+16], mm0
+        movd    [ebp+20], mm1
+        movd    [ebp+24], mm2
+        movd    [ebp+28], mm3
+        movd    [ebp+32], mm4
+        movd    [ebp+36], mm5
+        movd    [ebp+40], mm6
+        movd    [ebp+44], mm7
+
+        punpckhdq   mm0, mm0
+        punpckhdq   mm1, mm1
+        punpckhdq   mm2, mm2
+        punpckhdq   mm3, mm3
+        punpckhdq   mm4, mm4
+        punpckhdq   mm5, mm5
+        punpckhdq   mm6, mm6
+        punpckhdq   mm7, mm7
+
+        movd    [ebp+48], mm0
+        movd    [ebp+52], mm1
+        movd    [ebp+56], mm2
+        movd    [ebp+60], mm3
+        movd    [ebp+64], mm4
+        movd    [ebp+68], mm5
+        movd    [ebp+72], mm6
+        movd    [ebp+76], mm7
+
+        emms                                # leave MMX state so x87 code can run again
+
+        // The zero-sample early exit lands here: no MMX instruction has been
+        // executed on that path, so only the saved registers are restored.
+
+buff_pass_exit:
+        pop     esi
+        pop     edi
+        pop     ebx
+        pop     ebp
+        ret
+
+# These are assembly optimized versions of the following WavPack functions:
+#
+# void pack_decorr_stereo_pass_cont (
+#   struct decorr_pass *dpp,
+#   int32_t *in_buffer,
+#   int32_t *out_buffer,
+#   int32_t sample_count);
+#
+# void pack_decorr_stereo_pass_cont_rev (
+#   struct decorr_pass *dpp,
+#   int32_t *in_buffer,
+#   int32_t *out_buffer,
+#   int32_t sample_count);
+#
+# Each performs a single pass of stereo decorrelation, transferring from the
+# input buffer to the output buffer. Note that this version of the function
+# requires that the up to 8 previous (depending on dpp->term) stereo samples
+# are visible and correct. In other words, it ignores the "samples_*"
+# fields in the decorr_pass structure and gets the history data directly
+# from the source buffer. It does, however, return the appropriate history
+# samples to the decorr_pass structure before returning.
+#
+# This is written to work on an IA-32 processor and uses the MMX extensions
+# to improve the performance by processing both stereo channels together.
+# It is based on the original MMX code written by Joachim Henke that used
+# MMX intrinsics called from C. Many thanks to Joachim for that!
+#
+# No additional stack space is used; all storage is done in registers. The
+# arguments on entry:
+#
+#   struct decorr_pass *dpp     [ebp+8]
+#   int32_t *in_buffer          [ebp+12]
+#   int32_t *out_buffer         [ebp+16]
+#   int32_t sample_count        [ebp+20]
+#
+# During the processing loops, the following registers are used:
+#
+#   edi         input buffer pointer
+#   esi         direction (-8 forward, +8 reverse)
+#   ebx         delta from input to output buffer
+#   ecx         sample count
+#   edx         sign (dir) * term * -8 (terms 1-8 only)
+#   mm0, mm1    scratch
+#   mm2         original sample values
+#   mm3         correlation samples
+#   mm4         weight sums
+#   mm5         weights
+#   mm6         delta
+#   mm7         512 (for rounding)
+#
+
+_pack_decorr_stereo_pass_cont_rev_x86:
+pack_decorr_stereo_pass_cont_rev_x86:
+        push    ebp
+        mov     ebp, esp
+        push    ebx                         # save the registers that we need to
+        push    esi
+        push    edi
+
+        mov     esi, 8                      # esi indicates direction (inverted)
+        jmp     start
+
+_pack_decorr_stereo_pass_cont_x86:
+pack_decorr_stereo_pass_cont_x86:
+        push    ebp
+        mov     ebp, esp
+        push    ebx                         # save the registers that we need to
+        push    esi
+        push    edi
+
+        mov     esi, -8                     # esi indicates direction (inverted)
+
+start:  mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  # mm7 = round (512)
+
+        mov     eax, [ebp+8]                # access dpp
+        mov     eax, [eax+4]
+        movd    mm6, eax
+        punpckldq mm6, mm6                  # mm6 = delta (0-7)
+
+        mov     eax, [ebp+8]                # access dpp
+        movq    mm5, [eax+8]                # mm5 = weight_AB
+        movq    mm4, [eax+88]               # mm4 = sum_AB
+
+        mov     edi, [ebp+12]               # edi = in_buffer
+        mov     ebx, [ebp+16]
+        sub     ebx, edi                    # ebx = delta to output buffer
+
+        mov     ecx, [ebp+20]               # ecx = sample_count
+        test    ecx, ecx
+        jz      done
+
+        mov     eax, [ebp+8]                # *eax = dpp
+        mov     eax, [eax]                  # get term and vector to correct loop
+        cmp     eax, 17
+        je      term_17_loop
+        cmp     eax, 18
+        je      term_18_loop
+        cmp     eax, -1
+        je      term_minus_1_loop
+        cmp     eax, -2
+        je      term_minus_2_loop
+        cmp     eax, -3
+        je      term_minus_3_loop
+
+        shl     eax, 3
+        mov     edx, eax                    # edx = term * 8 to index correlation sample
+        test    esi, esi                    # test direction
+        jns     default_term_loop
+        neg     edx
+        jmp     default_term_loop
+
+        .balign  64
+
+default_term_loop:
+        movq    mm3, [edi+edx]              # mm3 = sam_AB
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16
+        pmaddwd mm0, mm5
+
+        movq    mm2, [edi]                  # mm2 = left_right
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # add shifted sums
+        movq    mm0, mm3
+        movq    [edi+ebx], mm2              # store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        sub     edi, esi
+        pxor    mm1, mm1                    # mm1 = zero
+        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pxor    mm5, mm0
+        paddd   mm5, mm2                    # and add to weight_AB
+        pxor    mm5, mm0
+        paddd   mm4, mm5                    # add weights to sum
+        dec     ecx
+        jnz     default_term_loop
+
+        mov     eax, [ebp+8]                # access dpp
+        movq    [eax+8], mm5                # put weight_AB back
+        movq    [eax+88], mm4               # put sum_AB back
+        emms
+
+        mov     edx, [ebp+8]                # access dpp with edx
+        mov     ecx, [edx]                  # ecx = dpp->term
+
+default_store_samples:
+        dec     ecx
+        add     edi, esi                    # back up one full sample
+        mov     eax, [edi+4]
+        mov     [edx+ecx*4+48], eax         # store samples_B [ecx]
+        mov     eax, [edi]
+        mov     [edx+ecx*4+16], eax         # store samples_A [ecx]
+        test    ecx, ecx
+        jnz     default_store_samples
+        jmp     done
+
+        .balign  64
+
+term_17_loop:
+        movq    mm3, [edi+esi]              # get previous calculated value
+        paddd   mm3, mm3
+        psubd   mm3, [edi+esi*2]
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16
+        pmaddwd mm0, mm5
+
+        movq    mm2, [edi]                  # mm2 = left_right
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # add shifted sums
+        movq    mm0, mm3
+        movq    [edi+ebx], mm2              # store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        sub     edi, esi
+        pxor    mm1, mm1                    # mm1 = zero
+        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pxor    mm5, mm0
+        paddd   mm5, mm2                    # and add to weight_AB
+        pxor    mm5, mm0
+        paddd   mm4, mm5                    # add weights to sum
+        dec     ecx
+        jnz     term_17_loop
+
+        mov     eax, [ebp+8]                # access dpp
+        movq    [eax+8], mm5                # put weight_AB back
+        movq    [eax+88], mm4               # put sum_AB back
+        emms
+        jmp     term_1718_common_store
+
+        .balign  64
+
+term_18_loop:
+        movq    mm3, [edi+esi]              # get previous calculated value
+        movq    mm0, mm3
+        psubd   mm3, [edi+esi*2]
+        psrad   mm3, 1
+        paddd   mm3, mm0                    # mm3 = sam_AB
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16
+        pmaddwd mm0, mm5
+
+        movq    mm2, [edi]                  # mm2 = left_right
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # add shifted sums
+        movq    mm0, mm3
+        movq    [edi+ebx], mm2              # store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        sub     edi, esi
+        pxor    mm1, mm1                    # mm1 = zero
+        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pxor    mm5, mm0
+        paddd   mm5, mm2                    # and add to weight_AB
+        pxor    mm5, mm0
+        dec     ecx
+        paddd   mm4, mm5                    # add weights to sum
+        jnz     term_18_loop
+
+        mov     eax, [ebp+8]                # access dpp
+        movq    [eax+8], mm5                # put weight_AB back
+        movq    [eax+88], mm4               # put sum_AB back
+        emms
+
+term_1718_common_store:
+
+        mov     eax, [ebp+8]                # access dpp
+        add     edi, esi                    # back up a full sample
+        mov     edx, [edi+4]                # dpp->samples_B [0] = iptr [-1];
+        mov     [eax+48], edx
+        mov     edx, [edi]                  # dpp->samples_A [0] = iptr [-2];
+        mov     [eax+16], edx
+        add     edi, esi                    # back up another sample
+        mov     edx, [edi+4]                # dpp->samples_B [1] = iptr [-3];
+        mov     [eax+52], edx
+        mov     edx, [edi]                  # dpp->samples_A [1] = iptr [-4];
+        mov     [eax+20], edx
+        jmp     done
+
+        .balign  64
+
+term_minus_1_loop:
+        movq    mm3, [edi+esi]              # mm3 = previous calculated value
+        movq    mm2, [edi]                  # mm2 = left_right
+        psrlq   mm3, 32
+        punpckldq mm3, mm2                  # mm3 = sam_AB
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16
+        pmaddwd mm0, mm5
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # add shifted sums
+        movq    mm0, mm3
+        movq    [edi+ebx], mm2              # store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        sub     edi, esi
+        pxor    mm1, mm1                    # mm1 = zero
+        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pcmpeqd mm1, mm1
+        psubd   mm1, mm7
+        psubd   mm1, mm7
+        psubd   mm1, mm0
+        pxor    mm5, mm0
+        paddd   mm5, mm1
+        paddusw mm5, mm2                    # and add to weight_AB
+        psubd   mm5, mm1
+        pxor    mm5, mm0
+        paddd   mm4, mm5                    # add weights to sum
+        dec     ecx
+        jnz     term_minus_1_loop
+
+        mov     eax, [ebp+8]                # access dpp
+        movq    [eax+8], mm5                # put weight_AB back
+        movq    [eax+88], mm4               # put sum_AB back
+        emms
+
+        add     edi, esi                    # back up a full sample
+        mov     edx, [edi+4]                # dpp->samples_A [0] = iptr [-1];
+        mov     eax, [ebp+8]
+        mov     [eax+16], edx
+        jmp     done
+
+        .balign  64
+
+term_minus_2_loop:
+        movq    mm2, [edi]                  # mm2 = left_right
+        movq    mm3, mm2                    # mm3 = swap dwords
+        psrlq   mm3, 32
+        punpckldq mm3, [edi+esi]            # mm3 = sam_AB
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16
+        pmaddwd mm0, mm5
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # add shifted sums
+        movq    mm0, mm3
+        movq    [edi+ebx], mm2              # store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        sub     edi, esi
+        pxor    mm1, mm1                    # mm1 = zero
+        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pcmpeqd mm1, mm1
+        psubd   mm1, mm7
+        psubd   mm1, mm7
+        psubd   mm1, mm0
+        pxor    mm5, mm0
+        paddd   mm5, mm1
+        paddusw mm5, mm2                    # and add to weight_AB
+        psubd   mm5, mm1
+        pxor    mm5, mm0
+        paddd   mm4, mm5                    # add weights to sum
+        dec     ecx
+        jnz     term_minus_2_loop
+
+        mov     eax, [ebp+8]                # access dpp
+        movq    [eax+8], mm5                # put weight_AB back
+        movq    [eax+88], mm4               # put sum_AB back
+        emms
+
+        add     edi, esi                    # back up a full sample
+        mov     edx, [edi]                  # dpp->samples_B [0] = iptr [-2];
+        mov     eax, [ebp+8]
+        mov     [eax+48], edx
+        jmp     done
+
+        .balign  64
+
+term_minus_3_loop:
+        movq    mm0, [edi+esi]              # mm0 = previous calculated value
+        movq    mm3, mm0                    # mm3 = swap dwords
+        psrlq   mm3, 32
+        punpckldq mm3, mm0                  # mm3 = sam_AB
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16
+        pmaddwd mm0, mm5
+
+        movq    mm2, [edi]                  # mm2 = left_right
+        pslld   mm0, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    # add shifted sums
+        movq    mm0, mm3
+        movq    [edi+ebx], mm2              # store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        sub     edi, esi
+        pxor    mm1, mm1                    # mm1 = zero
+        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pcmpeqd mm1, mm1
+        psubd   mm1, mm7
+        psubd   mm1, mm7
+        psubd   mm1, mm0
+        pxor    mm5, mm0
+        paddd   mm5, mm1
+        paddusw mm5, mm2                    # and add to weight_AB
+        psubd   mm5, mm1
+        pxor    mm5, mm0
+        paddd   mm4, mm5                    # add weights to sum
+        dec     ecx
+        jnz     term_minus_3_loop
+
+        mov     eax, [ebp+8]                # access dpp
+        movq    [eax+8], mm5                # put weight_AB back
+        movq    [eax+88], mm4               # put sum_AB back
+        emms
+
+        add     edi, esi                    # back up a full sample
+        mov     edx, [edi+4]                # dpp->samples_A [0] = iptr [-1];
+        mov     eax, [ebp+8]
+        mov     [eax+16], edx
+        mov     edx, [edi]                  # dpp->samples_B [0] = iptr [-2];
+        mov     [eax+48], edx
+
+done:   pop     edi
+        pop     esi
+        pop     ebx
+        leave
+        ret
+
+
+# This is an assembly optimized version of the following WavPack function:
+#
+# uint32_t decorr_mono_buffer (int32_t *buffer,
+#                              struct decorr_pass *decorr_passes,
+#                              int32_t num_terms,
+#                              int32_t sample_count)
+#
+# Decorrelate a buffer of mono samples, in place, as specified by the array
+# of decorr_pass structures. Note that this function does NOT return the
+# dpp->samples_X[] values in the "normalized" positions for terms 1-8, so if
+# the number of samples is not a multiple of MAX_TERM, these must be moved if
+# they are to be used somewhere else. The magnitude of the output samples is
+# accumulated and returned (see scan_max_magnitude() for more details). By
+# using the overflow detection of the multiply instruction, this detects
+# when the "long_math" variant is required.
+#
+# For the fastest possible operation with the four "common" decorrelation
+# filters (i.e, fast, normal, high and very high) this function can be
+# configured to include hardcoded versions of these filters that are created
+# using macros. In that case, the passed filter is checked to make sure that
+# it matches one of the four. If it doesn't, or if the hardcoded filters are
+# not enabled, a "general" version of the decorrelation loop is used. This
+# variable enables the hardcoded filters and can be disabled if there are
+# problems with the code or macros:
+
+        HARDCODED_FILTERS = 1
+
+# This is written to work on an IA-32 processor. The arguments are on the
+# stack at these locations (after 6 pushes, we do not use ebp as a base
+# pointer):
+#
+#   int32_t *buffer             [esp+28]
+#   struct decorr_pass *dpp     [esp+32]
+#   int32_t num_terms           [esp+36]
+#   int32_t sample_count        [esp+40]
+#
+# register usage:
+#
+# ecx = sample being decorrelated
+# esi = sample up counter
+# edi = *buffer
+# ebp = *dpp
+#
+# stack usage:
+#
+# [esp+0] = dpp end ptr (unused in hardcoded filter case)
+# [esp+4] = magnitude accumulator
+#
+        .if     HARDCODED_FILTERS
+#
+# This macro is used for checking the decorr_passes array to make sure that the terms match
+# the hardcoded terms. The terms of these filters are the first element in the tables defined
+# in decorr_tables.h (with the negative terms replaced with 1).
+# On a mismatch it jumps to use_general_version; otherwise it adds "ebp_offset" to ebp.
+#
+        .macro  chkterm term ebp_offset
+        cmp     BYTE PTR [ebp], \term       # test the low byte of the term field at [ebp]
+        jnz     use_general_version         # not the expected term: use the general loop
+        add     ebp, \ebp_offset            # step ebp by the given byte offset
+        .endm
+#
+# This macro processes the single specified term (with a fixed delta of 2) and updates the
+# term pointer (ebp) with the specified offset when done. It assumes the following registers:
+#
+# ecx = sample being decorrelated
+# esi = sample up counter (used for terms 1-8)
+# ebp = decorr_pass pointer for this term (updated with "ebp_offset" when done)
+# eax, ebx, edx = scratch
+#
+        .macro  exeterm term ebp_offset
+
+        .if     \term <= 8
+        mov     eax, esi
+        and     eax, 7                      # eax = up counter mod 8 (slot index)
+        mov     ebx, [ebp+16+eax*4]         # ebx = sample stored in that slot (the predictor)
+        .if     \term != 8
+        add     eax, \term
+        and     eax, 7                      # slot = (counter + term) mod 8; unchanged for term 8
+        .endif
+        mov     [ebp+16+eax*4], ecx         # save the incoming sample in that slot
+
+        .elseif     \term == 17
+
+        mov     edx, [ebp+16]               # handle term 17
+        mov     [ebp+16], ecx
+        lea     ebx, [edx+edx]
+        sub     ebx, [ebp+20]               # ebx = 2 * [ebp+16] - [ebp+20] (old values)
+        mov     [ebp+20], edx
+
+        .else
+
+        mov     edx, [ebp+16]               # handle term 18
+        mov     [ebp+16], ecx
+        lea     ebx, [edx+edx*2]
+        sub     ebx, [ebp+20]
+        sar     ebx, 1                      # ebx = (3 * [ebp+16] - [ebp+20]) >> 1 (old values)
+        mov     [ebp+20], edx
+
+        .endif
+
+        mov     eax, [ebp+8]                # eax = weight
+        imul    eax, ebx                    # 32-bit multiply is almost always enough
+        jo      1f                          # but handle overflow if it happens
+        sar     eax, 10
+        sbb     ecx, eax                    # borrow flag provides rounding
+        jmp     2f
+1:      mov     eax, [ebp+8]                # perform 64-bit multiply on overflow
+        imul    ebx                         # edx:eax = weight * ebx
+        shr     eax, 10
+        sbb     ecx, eax
+        shl     edx, 22
+        sub     ecx, edx
+2:      je      3f                          # result is zero (flags from sbb / sub): no update
+        test    ebx, ebx
+        je      3f                          # predictor is zero: no update
+        xor     ebx, ecx
+        sar     ebx, 30
+        or      ebx, 1                      # this generates delta of 1
+        shl     ebx, 1                      # this generates delta of 2
+        add     [ebp+8], ebx                # apply the signed delta to the weight
+3:      add     ebp, \ebp_offset
+
+        .endm
+
+        .endif                              # end of macro definitions
+
+# entry point of function
+
+_pack_decorr_mono_buffer_x86:
+pack_decorr_mono_buffer_x86:
+        push    ebp                         # save the registers that we need to
+        push    ebx
+        push    esi
+        push    edi
+        xor     eax, eax
+        push    eax                         # this is magnitude accumulator
+        push    eax                         # this will be dpp end ptr
+
+        mov     edi, [esp+28]               # edi is buffer pointer
+        xor     esi, esi                    # up counter = 0
+
+        cmp     DWORD PTR [esp+40], 0       # test & handle zero sample count & zero term count
+        jz      mexit
+        cmp     DWORD PTR [esp+36], 0
+        jz      mexit
+
+        .if     HARDCODED_FILTERS
+
+# first check to make sure all the "deltas" are 2
+
+        mov     ebp, [esp+32]               # ebp is decorr_pass pointer
+        mov     ebx, [esp+36]               # get term count
+deltas: cmp     BYTE PTR [ebp+4], 2         # make sure all the deltas are 2
+        jnz     use_general_version         # if any aren't, use general case
+        add     ebp, 96                     # next decorr_pass (96-byte stride)
+        dec     ebx
+        jnz     deltas
+
+        mov     ebp, [esp+32]               # ebp is decorr_pass pointer
+        mov     edx, [esp+36]               # get term count
+        cmp     dl, 2                       # 2 terms is "fast"
+        jnz     nfast
+        chkterm 18,  96                     # check "fast" terms
+        chkterm 17, -96
+        jmp     mono_fast_loop
+
+nfast:  cmp     dl, 5                       # 5 terms is "normal"
+        jnz     nnorm
+        chkterm 18, 96                      # check "normal" terms
+        chkterm 18, 96
+        chkterm 2,  96
+        chkterm 17, 96
+        chkterm 3,  96*-4
+        jmp     mono_normal_loop
+
+nnorm:  cmp     dl, 10                      # 10 terms is "high"
+        jnz     nhigh
+        chkterm 18, 96                      # check "high" terms
+        chkterm 18, 96
+        chkterm 18, 96
+        chkterm 1,  96
+        chkterm 2,  96
+        chkterm 3,  96
+        chkterm 5,  96
+        chkterm 1,  96
+        chkterm 17, 96
+        chkterm 4,  96*-9
+        jmp     mono_high_loop
+
+nhigh:  cmp     dl, 16                      # 16 terms is "very high"
+        jnz     use_general_version         # if none of these, use general version
+        chkterm 18, 96                      # else check "very high" terms
+        chkterm 18, 96
+        chkterm 2,  96
+        chkterm 3,  96
+        chkterm 1,  96
+        chkterm 18, 96
+        chkterm 2,  96
+        chkterm 4,  96
+        chkterm 7,  96
+        chkterm 5,  96
+        chkterm 3,  96
+        chkterm 6,  96
+        chkterm 8,  96
+        chkterm 1,  96
+        chkterm 18, 96
+        chkterm 2,  96*-15
+        jmp     mono_vhigh_loop
+
+        .balign  64
+
+mono_fast_loop:
+        mov     ecx, [edi+esi*4]             # ecx is the sample we're decorrelating
+
+        exeterm 18,  96
+        exeterm 17, -96
+
+        mov     [edi+esi*4], ecx            # store completed sample
+        mov     eax, ecx                    # magnitude accumulator |= (sample < 0) ? ~sample : sample
+        cdq
+        xor     eax, edx
+        or      [esp+4], eax
+        inc     esi                         # increment sample index
+        cmp     esi, [esp+40]
+        jnz     mono_fast_loop              # loop back for all samples
+        jmp     mexit
+
+        .balign  64
+
+mono_normal_loop:
+        mov     ecx, [edi+esi*4]             # ecx is the sample we're decorrelating
+
+        exeterm 18, 96
+        exeterm 18, 96
+        exeterm 2,  96
+        exeterm 17, 96
+        exeterm 3,  96*-4
+
+        mov     [edi+esi*4], ecx            # store completed sample
+        mov     eax, ecx                    # magnitude accumulator |= (sample < 0) ? ~sample : sample
+        cdq
+        xor     eax, edx
+        or      [esp+4], eax
+        inc     esi                         # increment sample index
+        cmp     esi, [esp+40]
+        jnz     mono_normal_loop            # loop back for all samples
+        jmp     mexit
+
+        .balign  64
+
+mono_high_loop:
+        mov     ecx, [edi+esi*4]             # ecx is the sample we're decorrelating
+
+        exeterm 18, 96
+        exeterm 18, 96
+        exeterm 18, 96
+        exeterm 1,  96
+        exeterm 2,  96
+        exeterm 3,  96
+        exeterm 5,  96
+        exeterm 1,  96
+        exeterm 17, 96
+        exeterm 4,  96*-9
+
+        mov     [edi+esi*4], ecx            # store completed sample
+        mov     eax, ecx                    # magnitude accumulator |= (sample < 0) ? ~sample : sample
+        cdq
+        xor     eax, edx
+        or      [esp+4], eax
+        inc     esi                         # increment sample index
+        cmp     esi, [esp+40]
+        jnz     mono_high_loop              # loop back for all samples
+        jmp     mexit
+
+        .balign  64
+
+mono_vhigh_loop:
+        mov     ecx, [edi+esi*4]             # ecx is the sample we're decorrelating
+
+        exeterm 18, 96
+        exeterm 18, 96
+        exeterm 2,  96
+        exeterm 3,  96
+        exeterm 1,  96
+        exeterm 18, 96
+        exeterm 2,  96
+        exeterm 4,  96
+        exeterm 7,  96
+        exeterm 5,  96
+        exeterm 3,  96
+        exeterm 6,  96
+        exeterm 8,  96
+        exeterm 1,  96
+        exeterm 18, 96
+        exeterm 2,  96*-15
+
+        mov     [edi+esi*4], ecx            # store completed sample
+        mov     eax, ecx                    # magnitude accumulator |= (sample < 0) ? ~sample : sample
+        cdq
+        xor     eax, edx
+        or      [esp+4], eax
+        inc     esi                         # increment sample index
+        cmp     esi, [esp+40]
+        jnz     mono_vhigh_loop             # loop back for all samples
+        jmp     mexit
+
+        .endif
+
+use_general_version:
+        mov     ebp, [esp+32]
+        mov     edx, [esp+36]               # get number of terms
+        imul    eax, edx, 96                # calculate & store termination check ptr
+        add     eax, [esp+32]
+        mov     [esp], eax
+        jmp     decorrelate_loop
+
+        .balign  64
+
+decorrelate_loop:
+        mov     ecx, [edi+esi*4]             # ecx is the sample we're decorrelating
+nxterm: mov     edx, [ebp]                  # edx = term of this pass
+        cmp     dl, 17
+        jge     3f                          # terms 17 and up are handled at 3:
+
+        mov     eax, esi
+        and     eax, 7                      # eax = up counter mod 8 (slot index)
+        mov     ebx, [ebp+16+eax*4]         # ebx = sample stored in that slot (the predictor)
+        add     eax, edx
+        and     eax, 7                      # slot = (counter + term) mod 8
+        mov     [ebp+16+eax*4], ecx         # save the incoming sample in that slot
+        jmp     domult
+
+        .balign  4
+3:      mov     edx, [ebp+16]
+        mov     [ebp+16], ecx
+        je      4f                          # flags still from "cmp dl, 17": equal means term 17
+        lea     ebx, [edx+edx*2]
+        sub     ebx, [ebp+20]
+        sar     ebx, 1                      # ebx = (3 * [ebp+16] - [ebp+20]) >> 1 (old values)
+        mov     [ebp+20], edx
+        jmp     domult
+
+        .balign  4
+4:      lea     ebx, [edx+edx]
+        sub     ebx, [ebp+20]               # ebx = 2 * [ebp+16] - [ebp+20] (old values)
+        mov     [ebp+20], edx
+
+domult: mov     eax, [ebp+8]                # eax = weight
+        mov     edx, eax                    # keep a copy of the weight for the update below
+        imul    eax, ebx
+        jo      multov                      # on overflow, jump to use 64-bit imul variant
+        sar     eax, 10
+        sbb     ecx, eax                    # borrow from the shift rounds the subtracted product
+        je      2f                          # result is zero: no weight update
+        test    ebx, ebx
+        je      2f                          # predictor is zero: no weight update
+        xor     ebx, ecx
+        sar     ebx, 31                     # ebx = -1 if signs differ, else 0
+        xor     edx, ebx
+        add     edx, [ebp+4]
+        xor     edx, ebx                    # edx = weight + delta, or weight - delta if signs differ
+        mov     [ebp+8], edx
+2:      add     ebp, 96                     # next decorr_pass (96-byte stride)
+        cmp     ebp, [esp]
+        jnz     nxterm
+
+        mov     [edi+esi*4], ecx            # store completed sample
+        mov     eax, ecx                    # magnitude accumulator |= (sample < 0) ? ~sample : sample
+        cdq
+        xor     eax, edx
+        or      [esp+4], eax
+        mov     ebp, [esp+32]               # reload decorr_passes pointer to first term
+        inc     esi                         # increment sample index
+        cmp     esi, [esp+40]
+        jnz     decorrelate_loop
+        jmp     mexit
+
+        .balign  4
+multov: mov     eax, [ebp+8]
+        imul    ebx                         # edx:eax = weight * ebx
+        shr     eax, 10
+        sbb     ecx, eax
+        shl     edx, 22
+        sub     ecx, edx
+        je      2f                          # result is zero: no weight update
+        test    ebx, ebx
+        je      2f                          # predictor is zero: no weight update
+        xor     ebx, ecx
+        sar     ebx, 31                     # ebx = -1 if signs differ, else 0
+        mov     eax, [ebp+8]
+        xor     eax, ebx
+        add     eax, [ebp+4]
+        xor     eax, ebx                    # eax = weight + delta, or weight - delta if signs differ
+        mov     [ebp+8], eax
+2:      add     ebp, 96                     # next decorr_pass (96-byte stride)
+        cmp     ebp, [esp]
+        jnz     nxterm
+
+        mov     [edi+esi*4], ecx            # store completed sample
+        mov     eax, ecx                    # magnitude accumulator |= (sample < 0) ? ~sample : sample
+        cdq
+        xor     eax, edx
+        or      [esp+4], eax
+        mov     ebp, [esp+32]               # reload decorr_passes pointer to first term
+        inc     esi                         # increment sample index
+        cmp     esi, [esp+40]
+        jnz     decorrelate_loop            # loop all the way back this time
+
+mexit:  pop     eax                         # discard the dpp end ptr slot
+        pop     eax                         # pop magnitude accumulator
+        pop     edi
+        pop     esi
+        pop     ebx
+        pop     ebp
+        ret
+
+
+# This is an assembly optimized version of the following WavPack function:
+#
+# void decorr_mono_pass_cont (int32_t *out_buffer,
+#                             int32_t *in_buffer,
+#                             struct decorr_pass *dpp,
+#                             int32_t sample_count);
+#
+# It performs a single pass of mono decorrelation, transferring from the
+# input buffer to the output buffer. Note that this version of the function
+# requires that the up to 8 previous (depending on dpp->term) mono samples
+# are visible and correct. In other words, it ignores the "samples_*"
+# fields in the decorr_pass structure and gets the history data directly
+# from the source buffer. It does, however, return the appropriate history
+# samples to the decorr_pass structure before returning.
+#
+# By using the overflow detection of the multiply instruction, it detects
+# when the "long_math" variant is required and automatically does it.
+#
+# This is written to work on an IA-32 processor. The arguments on entry:
+#
+#   int32_t *out_buffer         [ebp+8]
+#   int32_t *in_buffer          [ebp+12]
+#   struct decorr_pass *dpp     [ebp+16]
+#   int32_t sample_count        [ebp+20]
+#
+# Register / stack usage:
+#
+# esi = source ptr
+# edi = destination ptr
+# ecx = term * -4 (default terms)
+# ecx = previous sample (terms 17 & 18)
+# ebp = weight
+# [esp] = delta
+# [esp+4] = weight sum
+# [esp+8] = eptr
+#
+
+_pack_decorr_mono_pass_cont_x86:
+pack_decorr_mono_pass_cont_x86:
+        push    ebp
+        mov     ebp, esp
+        push    ebx                         # save the registers that we need to
+        push    esi
+        push    edi
+        cld
+
+        mov     esi, [ebp+12]
+        mov     edi, [ebp+8]
+        mov     edx, [ebp+16]               # edx = *dpp
+        mov     ecx, [ebp+20]               # ecx = sample count
+        mov     ebp, [edx+8]                # ebp = weight
+        lea     eax, [esi+ecx*4]            # calc & push eptr (access with [esp+8])
+        push    eax
+        mov     eax, [edx+88]               # push dpp->sum_A (access with [esp+4])
+        push    eax
+        mov     eax, [edx+4]                # push delta (access with [esp])
+        push    eax
+        test    ecx, ecx                    # test for and handle zero count
+        jz      mono_done
+
+        cld                                 # we use lodsd/stosd
+        mov     ecx, [esi-4]                # preload last sample
+        mov     eax, [edx]                  # get term & branch for terms 17 & 18
+        cmp     eax, 17
+        je      mono_term_17_loop
+        cmp     eax, 18
+        je      mono_term_18_loop
+
+        imul    ecx, eax, -4                # ecx is index to correlation sample now
+        jmp     mono_default_term_loop
+
+        .balign  64
+
+mono_default_term_loop:
+        mov     edx, [esi+ecx]              # edx = sample "term" positions back in the source
+        mov     ebx, edx                    # ebx = sam_A
+        imul    edx, ebp
+        jo      1f                          # 32-bit product overflowed: redo with 64-bit multiply
+        lodsd
+        sar     edx, 10
+        sbb     eax, edx                    # borrow from the shift rounds the subtracted product
+        jmp     2f
+1:      mov     eax, ebx
+        imul    ebp                         # edx:eax = sam_A * weight
+        shl     edx, 22
+        shr     eax, 10
+        adc     edx, eax                    # edx = apply_weight (sam_A)
+        lodsd
+        sub     eax, edx
+2:      stosd
+        je      3f                          # result is zero (stosd leaves flags alone): no update
+        test    ebx, ebx
+        je      3f                          # sam_A is zero: no weight update
+        xor     eax, ebx
+        cdq                                 # edx = -1 if result and sam_A differ in sign, else 0
+        xor     ebp, edx
+        add     ebp, [esp]
+        xor     ebp, edx                    # weight += delta, or -= delta if signs differ
+3:      add     [esp+4], ebp
+        cmp     esi, [esp+8]
+        jnz     mono_default_term_loop
+
+        mov     ecx, ebp                    # ecx = weight
+        mov     eax, [esp+4]                # eax = weight sum
+        lea     ebp, [esp+24]               # restore ebp (we've pushed 6 DWORDS)
+        mov     edx, [ebp+16]               # edx = *dpp
+        mov     [edx+8], ecx                # put weight back
+        mov     [edx+88], eax               # put dpp->sum_A back
+        mov     ecx, [edx]                  # ecx = dpp->term
+
+mono_default_store_samples:
+        dec     ecx
+        sub     esi, 4                      # back up one sample
+        mov     eax, [esi]
+        mov     [edx+ecx*4+16], eax         # store samples_A [ecx]
+        test    ecx, ecx
+        jnz     mono_default_store_samples
+        jmp     mono_done
+
+        .balign  64
+
+mono_term_17_loop:
+        lea     edx, [ecx+ecx]
+        sub     edx, [esi-8]                # edx = 2 * previous sample - the one before it
+        mov     ebx, edx                    # ebx = sam_A
+        imul    edx, ebp
+        jo      1f                          # 32-bit product overflowed: redo with 64-bit multiply
+        sar     edx, 10
+        lodsd
+        mov     ecx, eax                    # ecx = this input sample (previous for next pass)
+        sbb     eax, edx                    # carry from the sar survives lodsd / mov
+        jmp     2f
+1:      mov     eax, ebx
+        imul    ebp                         # edx:eax = sam_A * weight
+        shl     edx, 22
+        shr     eax, 10
+        adc     edx, eax                    # edx = apply_weight (sam_A)
+        lodsd
+        mov     ecx, eax
+        sub     eax, edx
+2:      stosd
+        je      3f                          # result is zero: no weight update
+        test    ebx, ebx
+        je      3f                          # sam_A is zero: no weight update
+        xor     eax, ebx
+        cdq                                 # edx = -1 if result and sam_A differ in sign, else 0
+        xor     ebp, edx
+        add     ebp, [esp]
+        xor     ebp, edx                    # weight += delta, or -= delta if signs differ
+3:      add     [esp+4], ebp
+        cmp     esi, [esp+8]
+        jnz     mono_term_17_loop
+        jmp     mono_term_1718_exit
+
+        .balign  64
+
+mono_term_18_loop:
+        lea     edx, [ecx+ecx*2]
+        sub     edx, [esi-8]
+        sar     edx, 1                      # edx = (3 * previous sample - the one before it) >> 1
+        mov     ebx, edx                    # ebx = sam_A
+        imul    edx, ebp
+        jo      1f                          # 32-bit product overflowed: redo with 64-bit multiply
+        sar     edx, 10
+        lodsd
+        mov     ecx, eax                    # ecx = this input sample (previous for next pass)
+        sbb     eax, edx                    # carry from the sar survives lodsd / mov
+        jmp     2f
+1:      mov     eax, ebx
+        imul    ebp                         # edx:eax = sam_A * weight
+        shl     edx, 22
+        shr     eax, 10
+        adc     edx, eax                    # edx = apply_weight (sam_A)
+        lodsd
+        mov     ecx, eax
+        sub     eax, edx
+2:      stosd
+        je      3f                          # result is zero: no weight update
+        test    ebx, ebx
+        je      3f                          # sam_A is zero: no weight update
+        xor     eax, ebx
+        cdq                                 # edx = -1 if result and sam_A differ in sign, else 0
+        xor     ebp, edx
+        add     ebp, [esp]
+        xor     ebp, edx                    # weight += delta, or -= delta if signs differ
+3:      add     [esp+4], ebp
+        cmp     esi, [esp+8]
+        jnz     mono_term_18_loop
+
+mono_term_1718_exit:
+        mov     ecx, ebp                    # ecx = weight
+        mov     eax, [esp+4]                # eax = weight sum
+        lea     ebp, [esp+24]               # restore ebp (we've pushed 6 DWORDS)
+        mov     edx, [ebp+16]               # edx = *dpp
+        mov     [edx+8], ecx                # put weight back
+        mov     [edx+88], eax               # put dpp->sum_A back
+        mov     eax, [esi-4]                # dpp->samples_A [0] = bptr [-1]
+        mov     [edx+16], eax
+        mov     eax, [esi-8]                # dpp->samples_A [1] = bptr [-2]
+        mov     [edx+20], eax
+
+mono_done:
+        add     esp, 12                     # deallocate stack space
+        pop     edi                         # pop saved registers & return
+        pop     esi
+        pop     ebx
+        pop     ebp
+        ret
+
+
+# This is an assembly optimized version of the following WavPack function:
+#
+# uint32_t scan_max_magnitude (int32_t *buffer, int32_t sample_count);
+#
+# This function scans a buffer of signed 32-bit ints and returns the magnitude
+# of the largest sample, with a power-of-two resolution. It might be more
+# useful to return the actual maximum absolute value, but that implementation
+# would be slower. Instead, this simply returns the "or" of all the values
+# "xor"d with their own sign, like so:
+#
+#     while (sample_count--)
+#         magnitude |= (*buffer < 0) ? ~*buffer++ : *buffer++;
+#
+# This is written to work on an IA-32 processor and uses the MMX extensions
+# to improve the performance by processing two samples together. The arguments
+# are on the stack at these locations (after 4 pushes, we do not use ebp as a
+# base pointer):
+#
+#   int32_t *buffer             [esp+20]
+#   uint32_t sample_count       [esp+24]
+#
+# During the processing loops, the following registers are used:
+#
+#   edi         buffer pointer
+#   esi         termination buffer pointer
+#   ebx         single magnitude accumulator
+#   mm0         dual magnitude accumulator
+#   mm1, mm2    scratch
+#
+
+_scan_max_magnitude_x86:
+scan_max_magnitude_x86:
+        push    ebp
+        push    ebx
+        push    esi
+        push    edi
+
+        xor     ebx, ebx                    # clear magnitude accumulator
+        mov     edi, [esp+20]               # edi = buffer pointer
+
+        mov     eax, [esp+24]               # eax = count
+        and     eax, 7
+        mov     ecx, eax                    # ecx = leftover samples to "manually" scan at end
+
+        mov     eax, [esp+24]               # eax = count
+        shr     eax, 3                      # eax = num of loops to process mmx (8 samples/loop)
+        shl     eax, 5                      # eax = num of bytes to process mmx (32 bytes/loop)
+        jz      nommx                       # jump around if no mmx loops to do (< 8 samples)
+
+        pxor    mm0, mm0                    # clear dual magnitude accumulator
+        add     eax, edi                    # esi = termination buffer pointer for mmx loop
+        mov     esi, eax
+        jmp     mmxlp
+
+        .balign  64
+
+mmxlp:  movq    mm1, [edi]                  # get two samples in mm1 & mm2
+        movq    mm2, mm1
+        psrad   mm1, 31                     # mm1 = sign (mm2)
+        pxor    mm1, mm2                    # mm1 = sample ^ sign (~sample if negative), or into result
+        por     mm0, mm1
+
+        movq    mm1, [edi+8]                # do it again with 6 more samples
+        movq    mm2, mm1
+        psrad   mm1, 31
+        pxor    mm1, mm2
+        por     mm0, mm1
+
+        movq    mm1, [edi+16]
+        movq    mm2, mm1
+        psrad   mm1, 31
+        pxor    mm1, mm2
+        por     mm0, mm1
+
+        movq    mm1, [edi+24]
+        movq    mm2, mm1
+        psrad   mm1, 31
+        pxor    mm1, mm2
+        por     mm0, mm1
+
+        add     edi, 32
+        cmp     edi, esi
+        jnz     mmxlp
+
+        movd    eax, mm0                    # ebx = "or" of high and low mm0
+        punpckhdq mm0, mm0
+        movd    ebx, mm0
+        or      ebx, eax
+        emms                                # leave MMX state before returning to the caller
+
+nommx:  and     ecx, ecx                    # any leftover samples to do?
+        jz      noleft
+
+leftlp: mov     eax, [edi]
+        cdq                                 # edx = sign of sample (0 or -1)
+        xor     eax, edx                    # eax = (sample < 0) ? ~sample : sample
+        or      ebx, eax
+        add     edi, 4
+        loop    leftlp                      # decrement ecx, repeat until it reaches zero
+
+noleft: mov     eax, ebx                    # move magnitude to eax for return
+        pop     edi
+        pop     esi
+        pop     ebx
+        pop     ebp
+        ret
+
+
+# This is an assembly optimized version of the following WavPack function:
+#
+# uint32_t log2buffer (int32_t *samples, uint32_t num_samples, int limit);
+#
+# This function scans a buffer of 32-bit ints and accumulates the total
+# log2 value of all the samples. This is useful for determining maximum
+# compression because the bitstream storage required for entropy coding
+# is proportional to the base 2 log of the samples.
+#
+# This is written to work on an IA-32 processor. The arguments are on the
+# stack at these locations (after 4 pushes, we do not use ebp as a base
+# pointer):
+#
+#   int32_t *samples            [esp+20]
+#   uint32_t num_samples        [esp+24]
+#   int limit                   [esp+28]
+#
+# During the processing loops, the following registers are used:
+#
+#   esi             input buffer pointer
+#   edi             sum accumulator
+#   ebx             sample count
+#   ebp             log2_table pointer
+#   eax,ecx,edx     scratch
+#
+
+        .balign  256
+
+log2_table:
+        .byte   0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0b, 0x0d, 0x0e, 0x10, 0x11, 0x12, 0x14, 0x15
+        .byte   0x16, 0x18, 0x19, 0x1a, 0x1c, 0x1d, 0x1e, 0x20, 0x21, 0x22, 0x24, 0x25, 0x26, 0x28, 0x29, 0x2a
+        .byte   0x2c, 0x2d, 0x2e, 0x2f, 0x31, 0x32, 0x33, 0x34, 0x36, 0x37, 0x38, 0x39, 0x3b, 0x3c, 0x3d, 0x3e
+        .byte   0x3f, 0x41, 0x42, 0x43, 0x44, 0x45, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4d, 0x4e, 0x4f, 0x50, 0x51
+        .byte   0x52, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63
+        .byte   0x64, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x74, 0x75
+        .byte   0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85
+        .byte   0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95
+        .byte   0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4
+        .byte   0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb2
+        .byte   0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc0
+        .byte   0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcb, 0xcc, 0xcd, 0xce
+        .byte   0xcf, 0xd0, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd8, 0xd9, 0xda, 0xdb
+        .byte   0xdc, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe4, 0xe5, 0xe6, 0xe7, 0xe7
+        .byte   0xe8, 0xe9, 0xea, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xee, 0xef, 0xf0, 0xf1, 0xf1, 0xf2, 0xf3, 0xf4
+        .byte   0xf4, 0xf5, 0xf6, 0xf7, 0xf7, 0xf8, 0xf9, 0xf9, 0xfa, 0xfb, 0xfc, 0xfc, 0xfd, 0xfe, 0xff, 0xff
+
+_log2buffer_x86:
+log2buffer_x86:
+        push    ebp
+        push    ebx
+        push    esi
+        push    edi
+        cld
+
+# These three instructions allow this to be PIC (position independent code). Having the hardcoded offset is
+# certainly not ideal, but it will probably work everywhere. The actual desired expression (nexti - log2_table)
+# would not compile on OS X.
+# NOTE(review): 266 presumably = 256 table bytes + 10 code bytes before nexti; recheck if code above changes.
+        call    nexti                       # push address of nexti (return address)
+nexti:  pop     ebp                         # pop address of nexti into ebp
+        sub     ebp, 266                    # offset to log2_table, should be (nexti - log2_table)
+
+        mov     esi, [esp+20]               # esi = sample source pointer
+        xor     edi, edi                    # edi = 0 (accumulator)
+        mov     ebx, [esp+24]               # ebx = num_samples
+        test    ebx, ebx                    # exit now if none, sum = 0
+        jz      normal_exit
+
+        mov     eax, [esp+28]               # eax = limit
+        test    eax, eax                    # we have separate loops for limit and no limit
+        jz      no_limit_loop
+        jmp     limit_loop
+
+        .balign  64
+
+limit_loop:
+        mov     eax, [esi]                  # get next sample into eax
+        cdq                                 # edx = sign of sample (for abs)
+        add     esi, 4
+        xor     eax, edx
+        sub     eax, edx                    # eax = abs (sample)
+        je      L40                         # skip if sample was zero
+        mov     edx, eax                    # move to edx and apply rounding
+        shr     eax, 9
+        add     edx, eax
+        bsr     ecx, edx                    # ecx = MSB set in sample (0 - 31)
+        lea     eax, [ecx+1]                # eax = number used bits in sample (1 - 32)
+        sub     ecx, 8                      # ecx = shift right amount (-8 to 23)
+        ror     edx, cl                     # use rotate to do "signed" shift
+        shl     eax, 8                      # move nbits to integer portion of log
+        movzx   edx, dl                     # dl = mantissa, look up log fraction in table
+        mov     al, [ebp+edx]               # eax = combined integer and fraction for full log
+        add     edi, eax                    # add to running sum
+        cmp     eax, [esp+28]               # compare this sample's log (eax, not the sum) to limit
+        jge     limit_exceeded
+L40:    sub     ebx, 1                      # loop back if more samples
+        jne     limit_loop
+        jmp     normal_exit
+
+        .balign  64
+
+no_limit_loop:
+        mov     eax, [esi]                  # get next sample into eax
+        cdq                                 # edx = sign of sample (for abs)
+        add     esi, 4
+        xor     eax, edx
+        sub     eax, edx                    # eax = abs (sample)
+        je      L45                         # skip if sample was zero
+        mov     edx, eax                    # move to edx and apply rounding
+        shr     eax, 9
+        add     edx, eax
+        bsr     ecx, edx                    # ecx = MSB set in sample (0 - 31)
+        lea     eax, [ecx+1]                # eax = number used bits in sample (1 - 32)
+        sub     ecx, 8                      # ecx = shift right amount (-8 to 23)
+        ror     edx, cl                     # use rotate to do "signed" shift
+        shl     eax, 8                      # move nbits to integer portion of log
+        movzx   edx, dl                     # dl = mantissa, look up log fraction in table
+        mov     al, [ebp+edx]               # eax = combined integer and fraction for full log
+        add     edi, eax                    # add to running sum
+L45:    sub     ebx, 1                      # loop back if more samples
+        jne     no_limit_loop
+        jmp     normal_exit
+
+limit_exceeded:
+        mov     edi, -1                     # -1 return means log limit exceeded
+normal_exit:
+        mov     eax, edi                    # move sum accumulator into eax for return
+        pop     edi
+        pop     esi
+        pop     ebx
+        pop     ebp
+        ret
+
+# Helper function to determine if specified CPU feature is available (used here for MMX).
+# Input parameter is index of feature to be checked (EDX from CPUID(1) only, MMX = 23).
+# Return value is the specified bit (0 or 1) or 0 if CPUID is not supported.
+
+_pack_cpu_has_feature_x86:
+pack_cpu_has_feature_x86:
+        pushfd                              # save eflags
+        pushfd                              # push another copy
+        xor     dword ptr [esp], 0x200000   # toggle ID bit on stack & pop it back into eflags
+        popfd
+        pushfd                              # store possibly modified eflags
+        pop     eax                         # and pop back into eax
+        xor     eax, [esp]                  # compare to original pushed eflags
+        popfd                               # restore original eflags
+        and     eax, 0x200000               # eax is nonzero if eflags ID bit was changeable
+        jz      oldcpu                      # return zero if CPUID is not available (wow!)
+
+        push    ebx                         # we must save ebx
+        mov     eax, 1                      # do cpuid (1) to get features into edx
+        cpuid
+        mov     eax, edx                    # copy into eax for shift
+        mov     cl, [esp+8]                 # get parameter and shift that bit index into LSB
+        sar     eax, cl
+        and     eax, 1                      # keep only the requested feature bit
+        pop     ebx                         # restore ebx and return 0 or 1
+
+oldcpu: ret                                 # return value in eax
+
+#ifdef __ELF__
+        .section .note.GNU-stack,"",@progbits
+#endif
+
diff --git a/third_party/wavpack/src/pack_x86.asm b/third_party/wavpack/src/pack_x86.asm
new file mode 100644
index 0000000..87b5f02
--- /dev/null
+++ b/third_party/wavpack/src/pack_x86.asm
@@ -0,0 +1,1827 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;                           **** WAVPACK ****                            ;;
+;;                  Hybrid Lossless Wavefile Compressor                   ;;
+;;              Copyright (c) 1998 - 2015 Conifer Software.               ;;
+;;                          All Rights Reserved.                          ;;
+;;      Distributed under the BSD Software License (see license.txt)      ;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+        .686
+        .mmx
+        .model  flat
+asmcode segment page 'CODE'
+        public  _pack_decorr_stereo_pass_x86
+        public  _pack_decorr_stereo_pass_cont_rev_x86
+        public  _pack_decorr_stereo_pass_cont_x86
+        public  _pack_decorr_mono_buffer_x86
+        public  _pack_decorr_mono_pass_cont_x86
+        public  _pack_cpu_has_feature_x86
+        public  _scan_max_magnitude_x86
+        public  _log2buffer_x86
+
+; This module contains X86 assembly optimized versions of functions required
+; to encode WavPack files.
+
+; This is an assembly optimized version of the following WavPack function:
+;
+; void pack_decorr_stereo_pass (
+;   struct decorr_pass *dpp,
+;   int32_t *buffer,
+;   int32_t sample_count);
+;
+; It performs a single pass of stereo decorrelation, in place, as specified
+; by the decorr_pass structure. Note that this function does NOT return the
+; dpp->samples_X[] values in the "normalized" positions for terms 1-8, so if
+; the number of samples is not a multiple of MAX_TERM, these must be moved if
+; they are to be used somewhere else.
+;
+; This is written to work on an IA-32 processor and uses the MMX extensions
+; to improve the performance by processing both stereo channels together.
+; It is based on the original MMX code written by Joachim Henke that used
+; MMX intrinsics called from C. Many thanks to Joachim for that!
+;
+; An issue with using MMX for this is that the sample history array in the
+; decorr_pass structure contains separate arrays for each channel while the
+; MMX code wants there to be a single array of dual samples. The fix for
+; this is to convert the data in the arrays on entry and exit, and this is
+; made easy by the fact that the 8 MMX registers hold exactly the required
+; amount of data (64 bytes)!
+;
+; This is written to work on an IA-32 processor. The arguments are on the
+; stack at these locations (after 4 pushes, we do not use ebp as a base
+; pointer):
+;
+;   struct decorr_pass *dpp   [esp+20]
+;   int32_t *buffer           [esp+24]
+;   int32_t sample_count      [esp+28]
+;
+; During the processing loops, the following registers are used:
+;
+;   edi         buffer pointer
+;   esi         termination buffer pointer
+;   eax,ebx,edx used in default term to reduce calculation         
+;   ebp         decorr_pass pointer
+;   mm0, mm1    scratch
+;   mm2         original sample values
+;   mm3         correlation samples
+;   mm4         0 (for pcmpeqd)
+;   mm5         weights
+;   mm6         delta
+;   mm7         512 (for rounding)
+;
+
+_pack_decorr_stereo_pass_x86:
+        push    ebp
+        push    ebx
+        push    edi
+        push    esi
+
+        mov     ebp, [esp+20]               ; ebp = *dpp
+        mov     edi, [esp+24]               ; edi = buffer
+        mov     esi, [esp+28]               ; esi = sample_count
+        sal     esi, 3                      ; esi = sample_count * 8 (bytes of stereo data)
+        jz      buff_zero_exit              ; no samples: return with dpp untouched (no MMX state is live yet)
+        add     esi, edi                    ; esi = termination buffer pointer
+
+        ; convert samples_A and samples_B array into samples_AB array for MMX
+        ; (the MMX registers provide exactly enough storage to do this easily)
+
+        movq        mm0, [ebp+16]
+        punpckldq   mm0, [ebp+48]           ; mm0 = samples_A[0], samples_B[0]
+        movq        mm1, [ebp+16]
+        punpckhdq   mm1, [ebp+48]           ; mm1 = samples_A[1], samples_B[1]
+        movq        mm2, [ebp+24]
+        punpckldq   mm2, [ebp+56]           ; mm2 = samples_A[2], samples_B[2]
+        movq        mm3, [ebp+24]
+        punpckhdq   mm3, [ebp+56]           ; mm3 = samples_A[3], samples_B[3]
+        movq        mm4, [ebp+32]
+        punpckldq   mm4, [ebp+64]           ; mm4 = samples_A[4], samples_B[4]
+        movq        mm5, [ebp+32]
+        punpckhdq   mm5, [ebp+64]           ; mm5 = samples_A[5], samples_B[5]
+        movq        mm6, [ebp+40]
+        punpckldq   mm6, [ebp+72]           ; mm6 = samples_A[6], samples_B[6]
+        movq        mm7, [ebp+40]
+        punpckhdq   mm7, [ebp+72]           ; mm7 = samples_A[7], samples_B[7]
+
+        movq    [ebp+16], mm0               ; write the pairs back as samples_AB[0..7]
+        movq    [ebp+24], mm1
+        movq    [ebp+32], mm2
+        movq    [ebp+40], mm3
+        movq    [ebp+48], mm4
+        movq    [ebp+56], mm5
+        movq    [ebp+64], mm6
+        movq    [ebp+72], mm7
+
+        mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  ; mm7 = round (512)
+
+        mov     eax, [ebp+4]
+        movd    mm6, eax
+        punpckldq mm6, mm6                  ; mm6 = delta (0-7)
+
+        mov     eax, 0FFFFh                 ; mask high weights to zero for PMADDWD
+        movd    mm5, eax
+        punpckldq mm5, mm5                  ; mm5 = weight mask 0x0000FFFF0000FFFF
+        pand    mm5, [ebp+8]                ; mm5 = weight_AB masked to 16-bit
+
+        movq    mm4, [ebp+16]               ; preload samples_AB[0]
+
+        mov     al, [ebp]                   ; get term and vector to correct loop
+        cmp     al, 17
+        je      buff_term_17_loop
+        cmp     al, 18
+        je      buff_term_18_loop
+        cmp     al, -1
+        je      buff_term_minus_1_loop
+        cmp     al, -2
+        je      buff_term_minus_2_loop
+        cmp     al, -3
+        je      buff_term_minus_3_loop
+
+        pxor    mm4, mm4                    ; mm4 = 0 (for pcmpeqd)
+        xor     eax, eax                    ; eax = read index into samples_AB (starts at 0)
+        xor     ebx, ebx
+        add     bl, [ebp]                   ; ebx = term
+        mov     ecx, 7                      ; ecx = index mask (8-entry circular history)
+        and     ebx, ecx                    ; ebx = write index into samples_AB (term & 7)
+        jmp     buff_default_term_loop
+
+        align  64
+
+buff_default_term_loop:
+        movq    mm2, [edi]                  ; mm2 = left_right
+        movq    mm3, [ebp+16+eax*8]         ; mm3 = sam_AB = samples_AB [eax]
+        inc     eax
+        and     eax, ecx                    ; advance read index modulo 8
+        movq    [ebp+16+ebx*8], mm2         ; samples_AB [ebx] = left_right
+        inc     ebx
+        and     ebx, ecx                    ; advance write index modulo 8
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1                      ; low word of each dword = sam_AB & 0x7FFF
+        pmaddwd mm1, mm5                    ; mm1 = low 15 bits of sam_AB times weight_AB
+
+        movq    mm0, mm3
+        psrld   mm0, 15                     ; low word of each dword = bits 15-30 of sam_AB
+        pmaddwd mm0, mm5                    ; mm0 = upper part of sam_AB times weight_AB
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    ; subtract both shifted products from left_right
+        movq    mm0, mm3
+        movq    [edi], mm2                  ; store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ result)
+        add     edi, 8
+        pcmpeqd mm2, mm4                    ; mm2 = 1s if result was zero
+        pcmpeqd mm3, mm4                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pxor    mm5, mm0                    ; complement weights where sign is negative
+        paddw   mm5, mm2                    ; and add to weight_AB
+        pxor    mm5, mm0                    ; undo complement (net: weight -= delta where sign was negative)
+        cmp     edi, esi
+        jnz     buff_default_term_loop
+
+        jmp     bdone
+
+        align  64
+
+buff_term_17_loop:
+        movq    mm3, mm4                    ; get previous calculated value
+        paddd   mm3, mm4
+        psubd   mm3, [ebp+24]               ; mm3 = sam_AB = 2 * samples_AB[0] - samples_AB[1]
+        movq    [ebp+24], mm4               ; samples_AB[1] = samples_AB[0]
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        psrld   mm0, 15
+        pmaddwd mm0, mm5
+
+        movq    mm2, [edi]                  ; mm2 = left_right
+        movq    mm4, mm2                    ; mm4 = new samples_AB[0] (kept in register)
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    ; subtract both shifted products from left_right
+        movq    mm0, mm3
+        movq    [edi], mm2                  ; store result
+        pxor    mm1, mm1
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ result)
+        add     edi, 8
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if result was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pxor    mm5, mm0
+        paddw   mm5, mm2                    ; and add to weight_AB
+        pxor    mm5, mm0
+        cmp     edi, esi
+        jnz     buff_term_17_loop
+
+        movq    [ebp+16], mm4               ; post-store samples_AB[0]
+        jmp     bdone
+
+        align  64
+
+buff_term_18_loop:
+        movq    mm3, mm4                    ; get previous calculated value
+        psubd   mm3, [ebp+24]
+        psrad   mm3, 1
+        paddd   mm3, mm4                    ; mm3 = sam_AB = samples_AB[0] + ((samples_AB[0] - samples_AB[1]) >> 1)
+        movq    [ebp+24], mm4               ; samples_AB[1] = samples_AB[0]
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        psrld   mm0, 15
+        pmaddwd mm0, mm5
+
+        movq    mm2, [edi]                  ; mm2 = left_right
+        movq    mm4, mm2                    ; mm4 = new samples_AB[0] (kept in register)
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    ; subtract both shifted products from left_right
+        movq    mm0, mm3
+        movq    [edi], mm2                  ; store result
+        pxor    mm1, mm1
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ result)
+        add     edi, 8
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if result was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pxor    mm5, mm0
+        paddw   mm5, mm2                    ; and add to weight_AB
+        pxor    mm5, mm0
+        cmp     edi, esi
+        jnz     buff_term_18_loop
+
+        movq    [ebp+16], mm4               ; post-store samples_AB[0]
+        jmp     bdone
+
+        align  64
+
+buff_term_minus_1_loop:
+        movq    mm3, mm4                    ; mm3 = previous calculated value
+        movq    mm2, [edi]                  ; mm2 = left_right
+        movq    mm4, mm2
+        psrlq   mm4, 32                     ; mm4 low dword = high dword of left_right (for next pass)
+        punpckldq mm3, mm2                  ; mm3 = sam_AB
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        psrld   mm0, 15
+        pmaddwd mm0, mm5
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    ; subtract both shifted products from left_right
+        movq    mm0, mm3
+        movq    [edi], mm2                  ; store result
+        pxor    mm1, mm1
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ result)
+        add     edi, 8
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if result was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pcmpeqd mm1, mm1                    ; mm1 = -1 in each dword
+        psubd   mm1, mm7
+        psubd   mm1, mm7                    ; mm1 = -1025 in each dword
+        psubd   mm1, mm0                    ; mm1 = saturation bias (-1025, or -1024 where sign is negative)
+        pxor    mm5, mm0
+        paddw   mm5, mm1                    ; bias weights so the unsigned saturating add can clip them
+        paddusw mm5, mm2                    ; and add to weight_AB
+        psubw   mm5, mm1                    ; remove bias
+        pxor    mm5, mm0
+        cmp     edi, esi
+        jnz     buff_term_minus_1_loop
+
+        movq    [ebp+16], mm4               ; post-store samples_AB[0]
+        jmp     bdone
+
+        align  64
+
+buff_term_minus_2_loop:
+        movq    mm2, [edi]                  ; mm2 = left_right
+        movq    mm3, mm2
+        psrlq   mm3, 32                     ; mm3 low dword = high dword of left_right
+        por     mm3, mm4                    ; mm3 = sam_AB; NOTE(review): relies on low dword of mm4 being zero - confirm
+        punpckldq mm4, mm2                  ; mm4 high dword = low dword of left_right (for next pass)
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        psrld   mm0, 15
+        pmaddwd mm0, mm5
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    ; subtract both shifted products from left_right
+        movq    mm0, mm3
+        movq    [edi], mm2                  ; store result
+        pxor    mm1, mm1
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ result)
+        add     edi, 8
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if result was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pcmpeqd mm1, mm1                    ; mm1 = -1 in each dword
+        psubd   mm1, mm7
+        psubd   mm1, mm7                    ; mm1 = -1025 in each dword
+        psubd   mm1, mm0                    ; mm1 = saturation bias (-1025, or -1024 where sign is negative)
+        pxor    mm5, mm0
+        paddw   mm5, mm1                    ; bias weights so the unsigned saturating add can clip them
+        paddusw mm5, mm2                    ; and add to weight_AB
+        psubw   mm5, mm1                    ; remove bias
+        pxor    mm5, mm0
+        cmp     edi, esi
+        jnz     buff_term_minus_2_loop
+
+        movq    [ebp+16], mm4               ; post-store samples_AB[0]
+        jmp     bdone
+
+        align  64
+
+buff_term_minus_3_loop:
+        movq    mm2, [edi]                  ; mm2 = left_right
+        movq    mm3, mm4                    ; mm3 = sam_AB (previous data, dwords swapped)
+        movq    mm4, mm2                    ; mm4 = copy of new data (dwords swapped below)
+        psrlq   mm4, 32
+        punpckldq mm4, mm2                  ; mm4 = new data with dwords swapped (sam_AB for next pass)
+
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrlw   mm1, 1
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        psrld   mm0, 15
+        pmaddwd mm0, mm5
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    ; subtract both shifted products from left_right
+        movq    mm0, mm3
+        movq    [edi], mm2                  ; store result
+        pxor    mm1, mm1
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ result)
+        add     edi, 8
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if result was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pcmpeqd mm1, mm1                    ; mm1 = -1 in each dword
+        psubd   mm1, mm7
+        psubd   mm1, mm7                    ; mm1 = -1025 in each dword
+        psubd   mm1, mm0                    ; mm1 = saturation bias (-1025, or -1024 where sign is negative)
+        pxor    mm5, mm0
+        paddw   mm5, mm1                    ; bias weights so the unsigned saturating add can clip them
+        paddusw mm5, mm2                    ; and add to weight_AB
+        psubw   mm5, mm1                    ; remove bias
+        pxor    mm5, mm0
+        cmp     edi, esi
+        jnz     buff_term_minus_3_loop
+
+        movq    [ebp+16], mm4               ; post-store samples_AB[0]
+
+bdone:  pslld   mm5, 16                     ; sign-extend 16-bit weights back to dwords
+        psrad   mm5, 16
+        movq    [ebp+8], mm5                ; put weight_AB back
+
+        ; convert samples_AB array back into samples_A and samples_B
+
+        movq    mm0, [ebp+16]
+        movq    mm1, [ebp+24]
+        movq    mm2, [ebp+32]
+        movq    mm3, [ebp+40]
+        movq    mm4, [ebp+48]
+        movq    mm5, [ebp+56]
+        movq    mm6, [ebp+64]
+        movq    mm7, [ebp+72]
+
+        movd    DWORD PTR [ebp+16], mm0     ; low dwords become samples_A[0..7]
+        movd    DWORD PTR [ebp+20], mm1
+        movd    DWORD PTR [ebp+24], mm2
+        movd    DWORD PTR [ebp+28], mm3
+        movd    DWORD PTR [ebp+32], mm4
+        movd    DWORD PTR [ebp+36], mm5
+        movd    DWORD PTR [ebp+40], mm6
+        movd    DWORD PTR [ebp+44], mm7
+
+        punpckhdq   mm0, mm0
+        punpckhdq   mm1, mm1
+        punpckhdq   mm2, mm2
+        punpckhdq   mm3, mm3
+        punpckhdq   mm4, mm4
+        punpckhdq   mm5, mm5
+        punpckhdq   mm6, mm6
+        punpckhdq   mm7, mm7
+
+        movd    DWORD PTR [ebp+48], mm0     ; high dwords become samples_B[0..7]
+        movd    DWORD PTR [ebp+52], mm1
+        movd    DWORD PTR [ebp+56], mm2
+        movd    DWORD PTR [ebp+60], mm3
+        movd    DWORD PTR [ebp+64], mm4
+        movd    DWORD PTR [ebp+68], mm5
+        movd    DWORD PTR [ebp+72], mm6
+        movd    DWORD PTR [ebp+76], mm7
+
+        emms
+
+buff_zero_exit: pop     esi                 ; zero-sample calls enter here, skipping the weight/sample write-back
+        pop     edi
+        pop     ebx
+        pop     ebp
+        ret
+
+; These are assembly optimized versions of the following WavPack functions:
+;
+; void pack_decorr_stereo_pass_cont (
+;   struct decorr_pass *dpp,
+;   int32_t *in_buffer,
+;   int32_t *out_buffer,
+;   int32_t sample_count);
+;
+; void pack_decorr_stereo_pass_cont_rev (
+;   struct decorr_pass *dpp,
+;   int32_t *in_buffer,
+;   int32_t *out_buffer,
+;   int32_t sample_count);
+;
+; It performs a single pass of stereo decorrelation, transferring from the
+; input buffer to the output buffer. Note that this version of the function
+; requires that the up to 8 previous (depending on dpp->term) stereo samples
+; are visible and correct. In other words, it ignores the "samples_*"
+; fields in the decorr_pass structure and gets the history data directly
+; from the source buffer. It does, however, return the appropriate history
+; samples to the decorr_pass structure before returning.
+;
+; This is written to work on an IA-32 processor and uses the MMX extensions
+; to improve the performance by processing both stereo channels together.
+; It is based on the original MMX code written by Joachim Henke that used
+; MMX intrinsics called from C. Many thanks to Joachim for that!
+;
+; No additional stack space is used; all storage is done in registers. The
+; arguments on entry:
+;
+;   struct decorr_pass *dpp     [ebp+8]
+;   int32_t *in_buffer          [ebp+12]
+;   int32_t *out_buffer         [ebp+16]
+;   int32_t sample_count        [ebp+20]
+;
+; During the processing loops, the following registers are used:
+;
+;   edi         input buffer pointer
+;   esi         direction (-8 forward, +8 reverse)
+;   ebx         delta from input to output buffer
+;   ecx         sample count
+;   edx         sign (dir) * term * -8 (terms 1-8 only)
+;   mm0, mm1    scratch
+;   mm2         original sample values
+;   mm3         correlation samples
+;   mm4         weight sums
+;   mm5         weights
+;   mm6         delta
+;   mm7         512 (for rounding)
+;
+
+_pack_decorr_stereo_pass_cont_rev_x86:
+        push    ebp
+        mov     ebp, esp
+        push    ebx                         ; save the registers that we need to
+        push    esi
+        push    edi
+
+        mov     esi, 8                      ; esi indicates direction (inverted)
+        jmp     start
+
+_pack_decorr_stereo_pass_cont_x86:
+        push    ebp
+        mov     ebp, esp
+        push    ebx                         ; save the registers that we need to
+        push    esi
+        push    edi
+
+        mov     esi, -8                     ; esi indicates direction (inverted)
+
+start:  mov     ecx, [ebp+20]               ; ecx = sample_count
+        test    ecx, ecx
+        jz      done                        ; nothing to do: exit before any MMX register is written (done has no emms)
+
+        mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  ; mm7 = round (512)
+
+        mov     eax, [ebp+8]                ; access dpp
+        mov     eax, [eax+4]
+        movd    mm6, eax
+        punpckldq mm6, mm6                  ; mm6 = delta (0-7)
+
+        mov     eax, [ebp+8]                ; access dpp
+        movq    mm5, [eax+8]                ; mm5 = weight_AB
+        movq    mm4, [eax+88]               ; mm4 = sum_AB
+
+        mov     edi, [ebp+12]               ; edi = in_buffer
+        mov     ebx, [ebp+16]
+        sub     ebx, edi                    ; ebx = delta to output buffer
+
+        mov     eax, [ebp+8]                ; *eax = dpp
+        mov     eax, [eax]                  ; get term and vector to correct loop
+        cmp     eax, 17
+        je      term_17_loop
+        cmp     eax, 18
+        je      term_18_loop
+        cmp     eax, -1
+        je      term_minus_1_loop
+        cmp     eax, -2
+        je      term_minus_2_loop
+        cmp     eax, -3
+        je      term_minus_3_loop
+
+        sal     eax, 3
+        mov     edx, eax                    ; edx = term * 8 to index correlation sample
+        test    esi, esi                    ; test direction
+        jns     default_term_loop           ; reverse (esi = 8): history lies at +term * 8
+        neg     edx                         ; forward (esi = -8): history lies at -term * 8
+        jmp     default_term_loop
+
+        align  64
+
+default_term_loop:
+        movq    mm3, [edi+edx]              ; mm3 = sam_AB
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17                     ; mm1 = sam_AB & 0x7FFF
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16                     ; mm0 = bits 15-30 of sam_AB
+        pmaddwd mm0, mm5
+
+        movq    mm2, [edi]                  ; mm2 = left_right
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    ; subtract both shifted products from left_right
+        movq    mm0, mm3
+        movq    [edi+ebx], mm2              ; store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ result)
+        sub     edi, esi                    ; step to next sample (esi holds the negated step)
+        pxor    mm1, mm1                    ; mm1 = zero
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if result was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pxor    mm5, mm0                    ; complement weights where sign is negative
+        paddd   mm5, mm2                    ; and add to weight_AB
+        pxor    mm5, mm0                    ; undo complement (net: weight -= delta where sign was negative)
+        paddd   mm4, mm5                    ; add weights to sum
+        dec     ecx
+        jnz     default_term_loop
+
+        mov     eax, [ebp+8]                ; access dpp
+        movq    [eax+8], mm5                ; put weight_AB back
+        movq    [eax+88], mm4               ; put sum_AB back
+        emms
+
+        mov     edx, [ebp+8]                ; access dpp with edx
+        mov     ecx, [edx]                  ; ecx = dpp->term
+
+default_store_samples:
+        dec     ecx
+        add     edi, esi                    ; back up one full sample
+        mov     eax, [edi+4]
+        mov     [edx+ecx*4+48], eax         ; store samples_B [ecx]
+        mov     eax, [edi]
+        mov     [edx+ecx*4+16], eax         ; store samples_A [ecx]
+        test    ecx, ecx
+        jnz     default_store_samples
+        jmp     done
+
+        align  64
+
+term_17_loop:
+        movq    mm3, [edi+esi]              ; get previous calculated value
+        paddd   mm3, mm3
+        psubd   mm3, [edi+esi*2]            ; mm3 = sam_AB = 2 * previous - the one before it
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16
+        pmaddwd mm0, mm5
+
+        movq    mm2, [edi]                  ; mm2 = left_right
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    ; subtract both shifted products from left_right
+        movq    mm0, mm3
+        movq    [edi+ebx], mm2              ; store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ result)
+        sub     edi, esi
+        pxor    mm1, mm1                    ; mm1 = zero
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if result was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pxor    mm5, mm0
+        paddd   mm5, mm2                    ; and add to weight_AB
+        pxor    mm5, mm0
+        paddd   mm4, mm5                    ; add weights to sum
+        dec     ecx
+        jnz     term_17_loop
+
+        mov     eax, [ebp+8]                ; access dpp
+        movq    [eax+8], mm5                ; put weight_AB back
+        movq    [eax+88], mm4               ; put sum_AB back
+        emms
+        jmp     term_1718_common_store
+
+        align  64
+
+term_18_loop:
+        movq    mm3, [edi+esi]              ; get previous calculated value
+        movq    mm0, mm3
+        psubd   mm3, [edi+esi*2]
+        psrad   mm3, 1
+        paddd   mm3, mm0                    ; mm3 = sam_AB
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16
+        pmaddwd mm0, mm5
+
+        movq    mm2, [edi]                  ; mm2 = left_right
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    ; subtract both shifted products from left_right
+        movq    mm0, mm3
+        movq    [edi+ebx], mm2              ; store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ result)
+        sub     edi, esi
+        pxor    mm1, mm1                    ; mm1 = zero
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if result was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pxor    mm5, mm0
+        paddd   mm5, mm2                    ; and add to weight_AB
+        pxor    mm5, mm0
+        dec     ecx
+        paddd   mm4, mm5                    ; add weights to sum
+        jnz     term_18_loop                ; tests the flags set by the dec ecx above
+
+        mov     eax, [ebp+8]                ; access dpp
+        movq    [eax+8], mm5                ; put weight_AB back
+        movq    [eax+88], mm4               ; put sum_AB back
+        emms
+
+term_1718_common_store:
+
+        mov     eax, [ebp+8]                ; access dpp
+        add     edi, esi                    ; back up a full sample
+        mov     edx, [edi+4]                ; dpp->samples_B [0] = iptr [-1];
+        mov     [eax+48], edx
+        mov     edx, [edi]                  ; dpp->samples_A [0] = iptr [-2];
+        mov     [eax+16], edx
+        add     edi, esi                    ; back up another sample
+        mov     edx, [edi+4]                ; dpp->samples_B [1] = iptr [-3];
+        mov     [eax+52], edx
+        mov     edx, [edi]                  ; dpp->samples_A [1] = iptr [-4];
+        mov     [eax+20], edx
+        jmp     done
+
+        align  64
+
+term_minus_1_loop:
+        movq    mm3, [edi+esi]              ; mm3 = previous calculated value
+        movq    mm2, [edi]                  ; mm2 = left_right
+        psrlq   mm3, 32
+        punpckldq mm3, mm2                  ; mm3 = sam_AB
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16
+        pmaddwd mm0, mm5
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    ; subtract both shifted products from left_right
+        movq    mm0, mm3
+        movq    [edi+ebx], mm2              ; store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ result)
+        sub     edi, esi
+        pxor    mm1, mm1                    ; mm1 = zero
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if result was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pcmpeqd mm1, mm1                    ; mm1 = -1 in each dword
+        psubd   mm1, mm7
+        psubd   mm1, mm7                    ; mm1 = -1025 in each dword
+        psubd   mm1, mm0                    ; mm1 = saturation bias (-1025, or -1024 where sign is negative)
+        pxor    mm5, mm0
+        paddd   mm5, mm1                    ; bias weights so the unsigned saturating add can clip them
+        paddusw mm5, mm2                    ; and add to weight_AB
+        psubd   mm5, mm1                    ; remove bias
+        pxor    mm5, mm0
+        paddd   mm4, mm5                    ; add weights to sum
+        dec     ecx
+        jnz     term_minus_1_loop
+
+        mov     eax, [ebp+8]                ; access dpp
+        movq    [eax+8], mm5                ; put weight_AB back
+        movq    [eax+88], mm4               ; put sum_AB back
+        emms
+
+        add     edi, esi                    ; back up a full sample
+        mov     edx, [edi+4]                ; dpp->samples_A [0] = iptr [-1];
+        mov     eax, [ebp+8]
+        mov     [eax+16], edx
+        jmp     done
+
+        align  64
+
+term_minus_2_loop:
+        movq    mm2, [edi]                  ; mm2 = left_right
+        movq    mm3, mm2                    ; mm3 = swap dwords
+        psrlq   mm3, 32
+        punpckldq mm3, [edi+esi]            ; mm3 = sam_AB
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16
+        pmaddwd mm0, mm5
+
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    ; subtract both shifted products from left_right
+        movq    mm0, mm3
+        movq    [edi+ebx], mm2              ; store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ result)
+        sub     edi, esi
+        pxor    mm1, mm1                    ; mm1 = zero
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if result was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pcmpeqd mm1, mm1                    ; mm1 = -1 in each dword
+        psubd   mm1, mm7
+        psubd   mm1, mm7                    ; mm1 = -1025 in each dword
+        psubd   mm1, mm0                    ; mm1 = saturation bias (-1025, or -1024 where sign is negative)
+        pxor    mm5, mm0
+        paddd   mm5, mm1                    ; bias weights so the unsigned saturating add can clip them
+        paddusw mm5, mm2                    ; and add to weight_AB
+        psubd   mm5, mm1                    ; remove bias
+        pxor    mm5, mm0
+        paddd   mm4, mm5                    ; add weights to sum
+        dec     ecx
+        jnz     term_minus_2_loop
+
+        mov     eax, [ebp+8]                ; access dpp
+        movq    [eax+8], mm5                ; put weight_AB back
+        movq    [eax+88], mm4               ; put sum_AB back
+        emms
+
+        add     edi, esi                    ; back up a full sample
+        mov     edx, [edi]                  ; dpp->samples_B [0] = iptr [-2];
+        mov     eax, [ebp+8]
+        mov     [eax+48], edx
+        jmp     done
+
+        align  64
+
+term_minus_3_loop:
+        movq    mm0, [edi+esi]              ; mm0 = previous calculated value
+        movq    mm3, mm0                    ; mm3 = swap dwords
+        psrlq   mm3, 32
+        punpckldq mm3, mm0                  ; mm3 = sam_AB
+
+        movq    mm1, mm3
+        pslld   mm1, 17
+        psrld   mm1, 17
+        pmaddwd mm1, mm5
+
+        movq    mm0, mm3
+        pslld   mm0, 1
+        psrld   mm0, 16
+        pmaddwd mm0, mm5
+
+        movq    mm2, [edi]                  ; mm2 = left_right
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        psubd   mm2, mm0
+        psubd   mm2, mm1                    ; subtract both shifted products from left_right
+        movq    mm0, mm3
+        movq    [edi+ebx], mm2              ; store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ result)
+        sub     edi, esi
+        pxor    mm1, mm1                    ; mm1 = zero
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if result was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pcmpeqd mm1, mm1                    ; mm1 = -1 in each dword
+        psubd   mm1, mm7
+        psubd   mm1, mm7                    ; mm1 = -1025 in each dword
+        psubd   mm1, mm0                    ; mm1 = saturation bias (-1025, or -1024 where sign is negative)
+        pxor    mm5, mm0
+        paddd   mm5, mm1                    ; bias weights so the unsigned saturating add can clip them
+        paddusw mm5, mm2                    ; and add to weight_AB
+        psubd   mm5, mm1                    ; remove bias
+        pxor    mm5, mm0
+        paddd   mm4, mm5                    ; add weights to sum
+        dec     ecx
+        jnz     term_minus_3_loop
+
+        mov     eax, [ebp+8]                ; access dpp
+        movq    [eax+8], mm5                ; put weight_AB back
+        movq    [eax+88], mm4               ; put sum_AB back
+        emms
+
+        add     edi, esi                    ; back up a full sample
+        mov     edx, [edi+4]                ; dpp->samples_A [0] = iptr [-1];
+        mov     eax, [ebp+8]
+        mov     [eax+16], edx
+        mov     edx, [edi]                  ; dpp->samples_B [0] = iptr [-2];
+        mov     [eax+48], edx
+
+done:   pop     edi                         ; common exit: every MMX path has already executed emms
+        pop     esi
+        pop     ebx
+        leave
+        ret
+
+
+; This is an assembly optimized version of the following WavPack function:
+;
+; uint32_t decorr_mono_buffer (int32_t *buffer,
+;                              struct decorr_pass *decorr_passes,
+;                              int32_t num_terms,
+;                              int32_t sample_count)
+;
+; Decorrelate a buffer of mono samples, in place, as specified by the array
+; of decorr_pass structures. Note that this function does NOT return the
+; dpp->samples_X[] values in the "normalized" positions for terms 1-8, so if
+; the number of samples is not a multiple of MAX_TERM, these must be moved if
+; they are to be used somewhere else. The magnitude of the output samples is
+; accumulated and returned (see scan_max_magnitude() for more details). By
+; using the overflow detection of the multiply instruction, this detects
+; when the "long_math" variant is required.
+;
+; For the fastest possible operation with the four "common" decorrelation
+; filters (i.e, fast, normal, high and very high) this function can be
+; configured to include hardcoded versions of these filters that are created
+; using macros. In that case, the passed filter is checked to make sure that
+; it matches one of the four. If it doesn't, or if the hardcoded filters are
+; not enabled, a "general" version of the decorrelation loop is used. This
+; variable enables the hardcoded filters and can be disabled if there are
+; problems with the code or macros:
+
+        HARDCODED_FILTERS = 1
+
+; This is written to work on an IA-32 processor. The arguments are on the
+; stack at these locations (after 6 pushes, we do not use ebp as a base
+; pointer):
+;
+;   int32_t *buffer             [esp+28]
+;   struct decorr_pass *dpp     [esp+32]
+;   int32_t num_terms           [esp+36]
+;   int32_t sample_count        [esp+40]
+;
+; register usage:
+;
+; ecx = sample being decorrelated
+; esi = sample up counter
+; edi = *buffer
+; ebp = *dpp
+;
+; stack usage:
+;
+; [esp+0] = dpp end ptr (unused in hardcoded filter case)
+; [esp+4] = magnitude accumulator
+;
+        if      HARDCODED_FILTERS
+;
+; This macro is used for checking the decorr_passes array to make sure that the terms match
+; the hardcoded terms. The terms of these filters are the first element in the tables defined
+; in decorr_tables.h (with the negative terms replaced with 1).
+;
+
+chkterm macro   term, ebp_offset
+        cmp     BYTE PTR [ebp], term        ; low byte of this pass's term vs. the expected term
+        jnz     use_general_version         ; any mismatch falls back to the general loop
+        add     ebp, ebp_offset             ; step ebp by the caller-supplied byte offset
+        endm
+
+;
+; This macro processes the single specified term (with a fixed delta of 2) and updates the
+; term pointer (ebp) with the specified offset when done. It assumes the following registers:
+;
+; ecx = sample being decorrelated
+; esi = sample up counter (used for terms 1-8)
+; ebp = decorr_pass pointer for this term (updated with "ebp_offset" when done)
+; eax, ebx, edx = scratch
+;
+
+exeterm macro   term, ebp_offset
+        local   over, cont, done
+
+        if      term le 8
+        mov     eax, esi                    ; terms 1-8: history slot index = up counter & 7
+        and     eax, 7
+        mov     ebx, [ebp+16+eax*4]         ; ebx = source sample read from slot (esi & 7)
+        if      term ne 8
+        add     eax, term                   ; (skipped for term 8: same slot after the & 7)
+        and     eax, 7
+        endif
+        mov     [ebp+16+eax*4], ecx         ; store current sample in slot ((esi + term) & 7)
+
+        elseif  term eq 17
+
+        mov     edx, [ebp+16]               ; handle term 17
+        mov     [ebp+16], ecx               ; current sample becomes the newest history value
+        lea     ebx, [edx+edx]              ; ebx = 2 * old [ebp+16] - [ebp+20]
+        sub     ebx, [ebp+20]
+        mov     [ebp+20], edx               ; old [ebp+16] moves down to [ebp+20]
+
+        else
+
+        mov     edx, [ebp+16]               ; handle term 18
+        mov     [ebp+16], ecx               ; current sample becomes the newest history value
+        lea     ebx, [edx+edx*2]            ; ebx = (3 * old [ebp+16] - [ebp+20]) >> 1
+        sub     ebx, [ebp+20]
+        sar     ebx, 1
+        mov     [ebp+20], edx               ; old [ebp+16] moves down to [ebp+20]
+
+        endif
+
+        mov     eax, [ebp+8]                ; eax = weight for this pass
+        imul    eax, ebx                    ; 32-bit multiply is almost always enough
+        jo      over                        ; but handle overflow if it happens
+        sar     eax, 10
+        sbb     ecx, eax                    ; borrow flag provides rounding
+        jmp     cont                        ; (jmp leaves the flags from sbb intact)
+over:   mov     eax, [ebp+8]                ; perform 64-bit multiply on overflow
+        imul    ebx                         ; edx:eax = weight * source sample
+        shr     eax, 10                     ; low part >> 10, carry = rounding bit
+        sbb     ecx, eax
+        shl     edx, 22                     ; high part supplies the upper bits of the product >> 10
+        sub     ecx, edx
+cont:   je      done                        ; result is zero (ZF from sbb / sub): no weight update
+        test    ebx, ebx
+        je      done                        ; source sample is zero: no weight update
+        xor     ebx, ecx                    ; sign bit = sign (source ^ result)
+        sar     ebx, 30
+        or      ebx, 1                      ; this generates delta of 1
+        sal     ebx, 1                      ; this generates delta of 2
+        add     [ebp+8], ebx                ; adjust weight by +2 or -2
+done:   add     ebp, ebp_offset             ; step ebp by the caller-supplied byte offset
+
+        endm
+
+        endif                               ; end of macro definitions
+
+; entry point of function
+
+_pack_decorr_mono_buffer_x86:
+        push    ebp                         ; save the registers that we need to
+        push    ebx
+        push    esi
+        push    edi
+        xor     eax, eax
+        push    eax                         ; this is magnitude accumulator
+        push    eax                         ; this will be dpp end ptr
+
+        mov     edi, [esp+28]               ; edi is buffer pointer
+        xor     esi, esi                    ; up counter = 0
+
+        cmp     DWORD PTR [esp+40], 0       ; test & handle zero sample count & zero term count
+        jz      mexit
+        cmp     DWORD PTR [esp+36], 0
+        jz      mexit
+
+        if      HARDCODED_FILTERS
+
+; first check to make sure all the "deltas" are 2
+
+        mov     ebp, [esp+32]               ; ebp is decorr_pass pointer
+        mov     ebx, [esp+36]               ; get term count
+deltas: cmp     BYTE PTR [ebp+4], 2         ; make sure all the deltas are 2
+        jnz     use_general_version         ; if any aren't, use general case
+        add     ebp, 96                     ; advance to next decorr_pass (96 bytes each)
+        dec     ebx
+        jnz     deltas
+
+        mov     ebp, [esp+32]               ; ebp is decorr_pass pointer
+        mov     edx, [esp+36]               ; get term count
+        cmp     dl, 2                       ; 2 terms is "fast"
+        jnz     nfast
+        chkterm 18,  96                     ; check "fast" terms
+        chkterm 17, -96
+        jmp     mono_fast_loop
+
+nfast:  cmp     dl, 5                       ; 5 terms is "normal"
+        jnz     nnorm
+        chkterm 18, 96                      ; check "normal" terms
+        chkterm 18, 96
+        chkterm 2,  96
+        chkterm 17, 96
+        chkterm 3,  96*-4
+        jmp     mono_normal_loop
+
+nnorm:  cmp     dl, 10                      ; 10 terms is "high"
+        jnz     nhigh
+        chkterm 18, 96                      ; check "high" terms
+        chkterm 18, 96
+        chkterm 18, 96
+        chkterm 1,  96
+        chkterm 2,  96
+        chkterm 3,  96
+        chkterm 5,  96
+        chkterm 1,  96
+        chkterm 17, 96
+        chkterm 4,  96*-9
+        jmp     mono_high_loop
+
+nhigh:  cmp     dl, 16                      ; 16 terms is "very high"
+        jnz     use_general_version         ; if none of these, use general version
+        chkterm 18, 96                      ; else check "very high" terms
+        chkterm 18, 96
+        chkterm 2,  96
+        chkterm 3,  96
+        chkterm 1,  96
+        chkterm 18, 96
+        chkterm 2,  96
+        chkterm 4,  96
+        chkterm 7,  96
+        chkterm 5,  96
+        chkterm 3,  96
+        chkterm 6,  96
+        chkterm 8,  96
+        chkterm 1,  96
+        chkterm 18, 96
+        chkterm 2,  96*-15
+        jmp     mono_vhigh_loop
+
+        align   64
+
+mono_fast_loop:
+        mov     ecx, [edi+esi*4]            ; ecx is the sample we're decorrelating
+
+        exeterm 18,  96
+        exeterm 17, -96                     ; last term rewinds ebp to the first pass
+
+        mov     [edi+esi*4], ecx            ; store completed sample
+        mov     eax, ecx                    ; magnitude accumulator |= (sample < 0) ? ~sample : sample
+        cdq
+        xor     eax, edx
+        or      [esp+4], eax
+        inc     esi                         ; increment sample index
+        cmp     esi, [esp+40]
+        jnz     mono_fast_loop              ; loop back for all samples
+        jmp     mexit
+
+        align   64
+
+mono_normal_loop:
+        mov     ecx, [edi+esi*4]            ; ecx is the sample we're decorrelating
+
+        exeterm 18, 96
+        exeterm 18, 96
+        exeterm 2,  96
+        exeterm 17, 96
+        exeterm 3,  96*-4                   ; last term rewinds ebp to the first pass
+
+        mov     [edi+esi*4], ecx            ; store completed sample
+        mov     eax, ecx                    ; magnitude accumulator |= (sample < 0) ? ~sample : sample
+        cdq
+        xor     eax, edx
+        or      [esp+4], eax
+        inc     esi                         ; increment sample index
+        cmp     esi, [esp+40]
+        jnz     mono_normal_loop            ; loop back for all samples
+        jmp     mexit
+
+        align   64
+
+mono_high_loop:
+        mov     ecx, [edi+esi*4]            ; ecx is the sample we're decorrelating
+
+        exeterm 18, 96
+        exeterm 18, 96
+        exeterm 18, 96
+        exeterm 1,  96
+        exeterm 2,  96
+        exeterm 3,  96
+        exeterm 5,  96
+        exeterm 1,  96
+        exeterm 17, 96
+        exeterm 4,  96*-9                   ; last term rewinds ebp to the first pass
+
+        mov     [edi+esi*4], ecx            ; store completed sample
+        mov     eax, ecx                    ; magnitude accumulator |= (sample < 0) ? ~sample : sample
+        cdq
+        xor     eax, edx
+        or      [esp+4], eax
+        inc     esi                         ; increment sample index
+        cmp     esi, [esp+40]
+        jnz     mono_high_loop              ; loop back for all samples
+        jmp     mexit
+
+        align   64
+
+mono_vhigh_loop:
+        mov     ecx, [edi+esi*4]            ; ecx is the sample we're decorrelating
+
+        exeterm 18, 96
+        exeterm 18, 96
+        exeterm 2,  96
+        exeterm 3,  96
+        exeterm 1,  96
+        exeterm 18, 96
+        exeterm 2,  96
+        exeterm 4,  96
+        exeterm 7,  96
+        exeterm 5,  96
+        exeterm 3,  96
+        exeterm 6,  96
+        exeterm 8,  96
+        exeterm 1,  96
+        exeterm 18, 96
+        exeterm 2,  96*-15                  ; last term rewinds ebp to the first pass
+
+        mov     [edi+esi*4], ecx            ; store completed sample
+        mov     eax, ecx                    ; magnitude accumulator |= (sample < 0) ? ~sample : sample
+        cdq
+        xor     eax, edx
+        or      [esp+4], eax
+        inc     esi                         ; increment sample index
+        cmp     esi, [esp+40]
+        jnz     mono_vhigh_loop             ; loop back for all samples
+        jmp     mexit
+
+        endif                               ; end of HARDCODED_FILTERS
+
+use_general_version:
+        mov     ebp, [esp+32]               ; ebp is decorr_pass pointer (first term)
+        mov     edx, [esp+36]               ; get number of terms
+        imul    eax, edx, 96                ; calculate & store termination check ptr
+        add     eax, [esp+32]
+        mov     [esp], eax
+        jmp     decorrelate_loop
+
+        align   64
+
+decorrelate_loop:
+        mov     ecx, [edi+esi*4]            ; ecx is the sample we're decorrelating
+nxterm: mov     edx, [ebp]                  ; edx = term for this pass
+        cmp     dl, 17
+        jge     @f                          ; terms 17 & 18 are handled separately below
+
+        mov     eax, esi                    ; other terms: history slot index = up counter & 7
+        and     eax, 7
+        mov     ebx, [ebp+16+eax*4]         ; ebx = source sample read from slot (esi & 7)
+        add     eax, edx
+        and     eax, 7
+        mov     [ebp+16+eax*4], ecx         ; store current sample in slot ((esi + term) & 7)
+        jmp     domult
+
+        align   4
+@@:     mov     edx, [ebp+16]
+        mov     [ebp+16], ecx
+        je      @f                          ; flags are still from "cmp dl, 17": equal means term 17
+        lea     ebx, [edx+edx*2]            ; term 18: ebx = (3 * old [ebp+16] - [ebp+20]) >> 1
+        sub     ebx, [ebp+20]
+        sar     ebx, 1
+        mov     [ebp+20], edx
+        jmp     domult
+
+        align   4
+@@:     lea     ebx, [edx+edx]              ; term 17: ebx = 2 * old [ebp+16] - [ebp+20]
+        sub     ebx, [ebp+20]
+        mov     [ebp+20], edx
+
+domult: mov     eax, [ebp+8]                ; eax = weight for this pass
+        mov     edx, eax                    ; edx = copy of weight for the update below
+        imul    eax, ebx
+        jo      multov                      ; on overflow, jump to use 64-bit imul variant
+        sar     eax, 10
+        sbb     ecx, eax                    ; borrow (carry out of the sar) provides rounding
+        je      @f                          ; result is zero: no weight update
+        test    ebx, ebx
+        je      @f                          ; source sample is zero: no weight update
+        xor     ebx, ecx
+        sar     ebx, 31                     ; ebx = sign mask of (source ^ result)
+        xor     edx, ebx                    ; weight = ((weight ^ sign) + delta) ^ sign
+        add     edx, [ebp+4]
+        xor     edx, ebx
+        mov     [ebp+8], edx                ; store updated weight
+@@:     add     ebp, 96                     ; advance to next decorr_pass
+        cmp     ebp, [esp]                  ; reached the dpp end ptr?
+        jnz     nxterm
+
+        mov     [edi+esi*4], ecx            ; store completed sample
+        mov     eax, ecx                    ; magnitude accumulator |= (sample < 0) ? ~sample : sample
+        cdq
+        xor     eax, edx
+        or      [esp+4], eax
+        mov     ebp, [esp+32]               ; reload decorr_passes pointer to first term
+        inc     esi                         ; increment sample index
+        cmp     esi, [esp+40]
+        jnz     decorrelate_loop
+        jmp     mexit
+
+        align   4
+multov: mov     eax, [ebp+8]                ; overflow path: redo the multiply in 64 bits
+        imul    ebx                         ; edx:eax = weight * source sample
+        shr     eax, 10
+        sbb     ecx, eax
+        shl     edx, 22
+        sub     ecx, edx
+        je      @f                          ; result is zero: no weight update
+        test    ebx, ebx
+        je      @f                          ; source sample is zero: no weight update
+        xor     ebx, ecx
+        sar     ebx, 31                     ; ebx = sign mask of (source ^ result)
+        mov     eax, [ebp+8]                ; weight = ((weight ^ sign) + delta) ^ sign
+        xor     eax, ebx
+        add     eax, [ebp+4]
+        xor     eax, ebx
+        mov     [ebp+8], eax                ; store updated weight
+@@:     add     ebp, 96                     ; advance to next decorr_pass
+        cmp     ebp, [esp]                  ; reached the dpp end ptr?
+        jnz     nxterm
+
+        mov     [edi+esi*4], ecx            ; store completed sample
+        mov     eax, ecx                    ; magnitude accumulator |= (sample < 0) ? ~sample : sample
+        cdq
+        xor     eax, edx
+        or      [esp+4], eax
+        mov     ebp, [esp+32]               ; reload decorr_passes pointer to first term
+        inc     esi                         ; increment sample index
+        cmp     esi, [esp+40]
+        jnz     decorrelate_loop            ; loop all the way back this time
+
+mexit:  pop     eax                         ; discard dpp end ptr slot
+        pop     eax                         ; pop magnitude accumulator (returned in eax)
+        pop     edi
+        pop     esi
+        pop     ebx
+        pop     ebp
+        ret
+
+
+; This is an assembly optimized version of the following WavPack function:
+;
+; void decorr_mono_pass_cont (int32_t *out_buffer,
+;                             int32_t *in_buffer,
+;                             struct decorr_pass *dpp,
+;                             int32_t sample_count);
+;
+; It performs a single pass of mono decorrelation, transferring from the
+; input buffer to the output buffer. Note that this version of the function
+; requires that the up to 8 previous (depending on dpp->term) mono samples
+; are visible and correct. In other words, it ignores the "samples_*"
+; fields in the decorr_pass structure and gets the history data directly
+; from the source buffer. It does, however, return the appropriate history
+; samples to the decorr_pass structure before returning.
+;
+; By using the overflow detection of the multiply instruction, it detects
+; when the "long_math" variant is required and automatically does it.
+;
+; This is written to work on an IA-32 processor. The arguments on entry:
+;
+;   int32_t *out_buffer         [ebp+8]
+;   int32_t *in_buffer          [ebp+12]
+;   struct decorr_pass *dpp     [ebp+16]
+;   int32_t sample_count        [ebp+20]
+;
+; Register / stack usage:
+;
+; esi = source ptr
+; edi = destination ptr
+; ecx = term * -4 (default terms)
+; ecx = previous sample (terms 17 & 18)
+; ebp = weight
+; [esp] = delta
+; [esp+4] = weight sum
+; [esp+8] = eptr
+;
+
+_pack_decorr_mono_pass_cont_x86:
+        push    ebp
+        mov     ebp, esp
+        push    ebx                         ; save the registers that we need to
+        push    esi
+        push    edi
+        cld
+
+        mov     esi, [ebp+12]               ; esi = in_buffer (source ptr)
+        mov     edi, [ebp+8]                ; edi = out_buffer (destination ptr)
+        mov     edx, [ebp+16]               ; edx = *dpp
+        mov     ecx, [ebp+20]               ; ecx = sample count
+        mov     ebp, [edx+8]                ; ebp = weight
+        lea     eax, [esi+ecx*4]            ; calc & push eptr (access with [esp+8])
+        push    eax
+        mov     eax, [edx+88]               ; push dpp->sum_A (access with [esp+4])
+        push    eax
+        mov     eax, [edx+4]                ; push delta (access with [esp])
+        push    eax
+        test    ecx, ecx                    ; test for and handle zero count
+        jz      mono_done
+
+        cld                                 ; we use lodsd/stosd
+        mov     ecx, [esi-4]                ; preload last sample
+        mov     eax, [edx]                  ; get term & branch for terms 17 & 18
+        cmp     eax, 17
+        je      mono_term_17_loop
+        cmp     eax, 18
+        je      mono_term_18_loop
+
+        imul    ecx, eax, -4                ; ecx is index to correlation sample now
+        jmp     mono_default_term_loop
+
+        align  64
+
+mono_default_term_loop:
+        mov     edx, [esi+ecx]              ; edx = sam_A, read "term" samples back in the source
+        mov     ebx, edx                    ; ebx = sam_A (kept for the weight update)
+        imul    edx, ebp                    ; 32-bit multiply by weight
+        jo      over                        ; on overflow, redo with 64-bit multiply
+        lodsd                               ; eax = next source sample (flags untouched)
+        sar     edx, 10
+        sbb     eax, edx                    ; borrow (carry out of the sar) provides rounding
+        jmp     @f
+over:   mov     eax, ebx
+        imul    ebp                         ; edx:eax = sam_A * weight
+        shl     edx, 22
+        shr     eax, 10
+        adc     edx, eax                    ; edx = apply_weight (sam_A)
+        lodsd
+        sub     eax, edx
+@@:     stosd                               ; store result (flags still from sbb / sub)
+        je      @f                          ; result is zero: no weight update
+        test    ebx, ebx
+        je      @f                          ; sam_A is zero: no weight update
+        xor     eax, ebx
+        cdq                                 ; edx = sign mask of (sam_A ^ result)
+        xor     ebp, edx                    ; weight = ((weight ^ sign) + delta) ^ sign
+        add     ebp, [esp]
+        xor     ebp, edx
+@@:     add     [esp+4], ebp                ; add weight to weight sum
+        cmp     esi, [esp+8]                ; reached eptr?
+        jnz     mono_default_term_loop
+
+        mov     ecx, ebp                    ; ecx = weight
+        mov     eax, [esp+4]                ; eax = weight sum
+        lea     ebp, [esp+24]               ; restore ebp (we've pushed 6 DWORDS)
+        mov     edx, [ebp+16]               ; edx = *dpp
+        mov     [edx+8], ecx                ; put weight back
+        mov     [edx+88], eax               ; put dpp->sum_A back
+        mov     ecx, [edx]                  ; ecx = dpp->term
+
+mono_default_store_samples:
+        dec     ecx                         ; fill samples_A [term-1] down to samples_A [0]
+        sub     esi, 4                      ; back up one sample
+        mov     eax, [esi]
+        mov     [edx+ecx*4+16], eax         ; store samples_A [ecx]
+        test    ecx, ecx
+        jnz     mono_default_store_samples
+        jmp     mono_done
+
+        align  64
+
+mono_term_17_loop:
+        lea     edx, [ecx+ecx]              ; ecx = previous source sample
+        sub     edx, [esi-8]                ; edx = 2 * bptr [-1] - bptr [-2]
+        mov     ebx, edx                    ; ebx = sam_A
+        imul    edx, ebp                    ; 32-bit multiply by weight
+        jo      over17                      ; on overflow, redo with 64-bit multiply
+        sar     edx, 10
+        lodsd                               ; eax = next source sample (flags untouched)
+        mov     ecx, eax                    ; remember it as "previous sample" for next iteration
+        sbb     eax, edx                    ; borrow (carry out of the sar) provides rounding
+        jmp     @f
+over17: mov     eax, ebx
+        imul    ebp                         ; edx:eax = sam_A * weight
+        shl     edx, 22
+        shr     eax, 10
+        adc     edx, eax                    ; edx = apply_weight (sam_A)
+        lodsd
+        mov     ecx, eax
+        sub     eax, edx
+@@:     stosd                               ; store result (flags still from sbb / sub)
+        je      @f                          ; result is zero: no weight update
+        test    ebx, ebx
+        je      @f                          ; sam_A is zero: no weight update
+        xor     eax, ebx
+        cdq                                 ; edx = sign mask of (sam_A ^ result)
+        xor     ebp, edx                    ; weight = ((weight ^ sign) + delta) ^ sign
+        add     ebp, [esp]
+        xor     ebp, edx
+@@:     add     [esp+4], ebp                ; add weight to weight sum
+        cmp     esi, [esp+8]                ; reached eptr?
+        jnz     mono_term_17_loop
+        jmp     mono_term_1718_exit
+
+        align  64
+
+mono_term_18_loop:
+        lea     edx, [ecx+ecx*2]            ; ecx = previous source sample
+        sub     edx, [esi-8]                ; edx = 3 * bptr [-1] - bptr [-2]
+        sar     edx, 1
+        mov     ebx, edx                    ; ebx = sam_A
+        imul    edx, ebp                    ; 32-bit multiply by weight
+        jo      over18                      ; on overflow, redo with 64-bit multiply
+        sar     edx, 10
+        lodsd                               ; eax = next source sample (flags untouched)
+        mov     ecx, eax                    ; remember it as "previous sample" for next iteration
+        sbb     eax, edx                    ; borrow (carry out of the sar) provides rounding
+        jmp     @f
+over18: mov     eax, ebx
+        imul    ebp                         ; edx:eax = sam_A * weight
+        shl     edx, 22
+        shr     eax, 10
+        adc     edx, eax                    ; edx = apply_weight (sam_A)
+        lodsd
+        mov     ecx, eax
+        sub     eax, edx
+@@:     stosd                               ; store result (flags still from sbb / sub)
+        je      @f                          ; result is zero: no weight update
+        test    ebx, ebx
+        je      @f                          ; sam_A is zero: no weight update
+        xor     eax, ebx
+        cdq                                 ; edx = sign mask of (sam_A ^ result)
+        xor     ebp, edx                    ; weight = ((weight ^ sign) + delta) ^ sign
+        add     ebp, [esp]
+        xor     ebp, edx
+@@:     add     [esp+4], ebp                ; add weight to weight sum
+        cmp     esi, [esp+8]                ; reached eptr?
+        jnz     mono_term_18_loop
+
+mono_term_1718_exit:
+        mov     ecx, ebp                    ; ecx = weight
+        mov     eax, [esp+4]                ; eax = weight sum
+        lea     ebp, [esp+24]               ; restore ebp (we've pushed 6 DWORDS)
+        mov     edx, [ebp+16]               ; edx = *dpp
+        mov     [edx+8], ecx                ; put weight back
+        mov     [edx+88], eax               ; put dpp->sum_A back
+        mov     eax, [esi-4]                ; dpp->samples_A [0] = bptr [-1]
+        mov     [edx+16], eax
+        mov     eax, [esi-8]                ; dpp->samples_A [1] = bptr [-2]
+        mov     [edx+20], eax
+
+mono_done:
+        add     esp, 12                     ; deallocate stack space
+        pop     edi                         ; pop saved registers & return
+        pop     esi
+        pop     ebx
+        pop     ebp
+        ret
+
+
+; This is an assembly optimized version of the following WavPack function:
+;
+; uint32_t scan_max_magnitude (int32_t *buffer, int32_t sample_count);
+;
+; This function scans a buffer of signed 32-bit ints and returns the magnitude
+; of the largest sample, with a power-of-two resolution. It might be more
+; useful to return the actual maximum absolute value, but that implementation
+; would be slower. Instead, this simply returns the "or" of all the values
+; "xor"d with their own sign, like so:
+;
+;     while (sample_count--)
+;         magnitude |= (*buffer < 0) ? ~*buffer++ : *buffer++;
+;
+; This is written to work on an IA-32 processor and uses the MMX extensions
+; to improve the performance by processing two samples together. The arguments
+; are on the stack at these locations (after 4 pushes, we do not use ebp as a
+; base pointer):
+;
+;   int32_t *buffer             [esp+20]
+;   uint32_t sample_count       [esp+24]
+;
+; During the processing loops, the following registers are used:
+;
+;   edi         buffer pointer
+;   esi         termination buffer pointer
+;   ebx         single magnitude accumulator
+;   mm0         dual magnitude accumulator
+;   mm1, mm2    scratch
+;
+
+_scan_max_magnitude_x86:
+        push    ebp                         ; save the registers that we need to
+        push    ebx
+        push    esi
+        push    edi
+
+        xor     ebx, ebx                    ; clear magnitude accumulator
+        mov     edi, [esp+20]               ; edi = buffer pointer
+
+        mov     eax, [esp+24]               ; eax = count
+        and     eax, 7
+        mov     ecx, eax                    ; ecx = leftover samples to "manually" scan at end
+
+        mov     eax, [esp+24]               ; eax = count
+        shr     eax, 3                      ; eax = num of loops to process mmx (8 samples/loop)
+        shl     eax, 5                      ; eax = num of bytes to process mmx (32 bytes/loop)
+        jz      nommx                       ; jump around if no mmx loops to do (< 8 samples)
+
+        pxor    mm0, mm0                    ; clear dual magnitude accumulator
+        add     eax, edi                    ; esi = termination buffer pointer for mmx loop
+        mov     esi, eax
+        jmp     mmxlp
+
+        align  64
+
+mmxlp:  movq    mm1, [edi]                  ; get two samples in mm1 & mm2
+        movq    mm2, mm1
+        psrad   mm1, 31                     ; mm1 = sign (mm2)
+        pxor    mm1, mm2                    ; mm1 = sample ^ sign (~sample if negative), or into result
+        por     mm0, mm1
+
+        movq    mm1, [edi+8]                ; do it again with 6 more samples
+        movq    mm2, mm1
+        psrad   mm1, 31
+        pxor    mm1, mm2
+        por     mm0, mm1
+
+        movq    mm1, [edi+16]
+        movq    mm2, mm1
+        psrad   mm1, 31
+        pxor    mm1, mm2
+        por     mm0, mm1
+
+        movq    mm1, [edi+24]
+        movq    mm2, mm1
+        psrad   mm1, 31
+        pxor    mm1, mm2
+        por     mm0, mm1
+
+        add     edi, 32                     ; advance 8 samples
+        cmp     edi, esi
+        jnz     mmxlp
+
+        movd    eax, mm0                    ; ebx = "or" of high and low mm0
+        punpckhdq mm0, mm0
+        movd    ebx, mm0
+        or      ebx, eax
+        emms                                ; done with MMX registers
+
+nommx:  and     ecx, ecx                    ; any leftover samples to do?
+        jz      noleft
+
+leftlp: mov     eax, [edi]                  ; scalar tail: ebx |= (sample < 0) ? ~sample : sample
+        cdq
+        xor     eax, edx
+        or      ebx, eax
+        add     edi, 4
+        loop    leftlp                      ; decrement ecx and repeat while nonzero
+
+noleft: mov     eax, ebx                    ; move magnitude to eax for return
+        pop     edi
+        pop     esi
+        pop     ebx
+        pop     ebp
+        ret
+
+
+; This is an assembly optimized version of the following WavPack function:
+;
+; uint32_t log2buffer (int32_t *samples, uint32_t num_samples, int limit);
+;
+; This function scans a buffer of 32-bit ints and accumulates the total
+; log2 value of all the samples. This is useful for determining maximum
+; compression because the bitstream storage required for entropy coding
+; is proportional to the base 2 log of the samples.
+;
+; This is written to work on an IA-32 processor. The arguments are on the
+; stack at these locations (after 4 pushes, we do not use ebp as a base
+; pointer):
+;
+;   int32_t *samples            [esp+20]
+;   uint32_t num_samples        [esp+24]
+;   int limit                   [esp+28]
+;
+; During the processing loops, the following registers are used:
+;
+;   esi             input buffer pointer
+;   edi             sum accumulator
+;   ebx             sample count
+;   ebp             log2_table pointer
+;   eax,ecx,edx     scratch
+;
+
+        align  256
+        .radix 16                           ; the table bytes below are written in hexadecimal
+
+log2_table:                                 ; 256 byte entries, indexed by an 8-bit mantissa
+        byte   000, 001, 003, 004, 006, 007, 009, 00a, 00b, 00d, 00e, 010, 011, 012, 014, 015
+        byte   016, 018, 019, 01a, 01c, 01d, 01e, 020, 021, 022, 024, 025, 026, 028, 029, 02a
+        byte   02c, 02d, 02e, 02f, 031, 032, 033, 034, 036, 037, 038, 039, 03b, 03c, 03d, 03e
+        byte   03f, 041, 042, 043, 044, 045, 047, 048, 049, 04a, 04b, 04d, 04e, 04f, 050, 051
+        byte   052, 054, 055, 056, 057, 058, 059, 05a, 05c, 05d, 05e, 05f, 060, 061, 062, 063
+        byte   064, 066, 067, 068, 069, 06a, 06b, 06c, 06d, 06e, 06f, 070, 071, 072, 074, 075
+        byte   076, 077, 078, 079, 07a, 07b, 07c, 07d, 07e, 07f, 080, 081, 082, 083, 084, 085
+        byte   086, 087, 088, 089, 08a, 08b, 08c, 08d, 08e, 08f, 090, 091, 092, 093, 094, 095
+        byte   096, 097, 098, 099, 09a, 09b, 09b, 09c, 09d, 09e, 09f, 0a0, 0a1, 0a2, 0a3, 0a4
+        byte   0a5, 0a6, 0a7, 0a8, 0a9, 0a9, 0aa, 0ab, 0ac, 0ad, 0ae, 0af, 0b0, 0b1, 0b2, 0b2
+        byte   0b3, 0b4, 0b5, 0b6, 0b7, 0b8, 0b9, 0b9, 0ba, 0bb, 0bc, 0bd, 0be, 0bf, 0c0, 0c0
+        byte   0c1, 0c2, 0c3, 0c4, 0c5, 0c6, 0c6, 0c7, 0c8, 0c9, 0ca, 0cb, 0cb, 0cc, 0cd, 0ce
+        byte   0cf, 0d0, 0d0, 0d1, 0d2, 0d3, 0d4, 0d4, 0d5, 0d6, 0d7, 0d8, 0d8, 0d9, 0da, 0db
+        byte   0dc, 0dc, 0dd, 0de, 0df, 0e0, 0e0, 0e1, 0e2, 0e3, 0e4, 0e4, 0e5, 0e6, 0e7, 0e7
+        byte   0e8, 0e9, 0ea, 0ea, 0eb, 0ec, 0ed, 0ee, 0ee, 0ef, 0f0, 0f1, 0f1, 0f2, 0f3, 0f4
+        byte   0f4, 0f5, 0f6, 0f7, 0f7, 0f8, 0f9, 0f9, 0fa, 0fb, 0fc, 0fc, 0fd, 0fe, 0ff, 0ff
+
+        .radix  10                          ; back to decimal for the code that follows
+
+_log2buffer_x86:
+        push    ebp                         ; save the registers that we need to
+        push    ebx
+        push    esi
+        push    edi
+        cld
+
+        mov     esi, [esp+20]               ; esi = sample source pointer
+        xor     edi, edi                    ; edi = 0 (accumulator)
+        mov     ebx, [esp+24]               ; ebx = num_samples
+        test    ebx, ebx                    ; exit now if none, sum = 0
+        jz      normal_exit
+
+; These three instructions allow this to be PIC (position independent code). The purpose is to
+; load the address of the log2_table into ebp regardless of where this is all loaded in memory.
+
+        call    nexti                       ; push address of nexti (return address)
+nexti:  pop     ebp                         ; pop address of nexti into ebp
+        sub     ebp, nexti - log2_table     ; offset to log2_table
+
+        mov     eax, [esp+28]               ; eax = limit
+        test    eax, eax                    ; we have separate loops for limit and no limit
+        jz      no_limit_loop
+        jmp     limit_loop
+
+        align  64
+
+limit_loop:
+        mov     eax, [esi]                  ; get next sample into eax
+        cdq                                 ; edx = sign of sample (for abs)
+        add     esi, 4
+        xor     eax, edx
+        sub     eax, edx                    ; eax = abs (sample), ZF set if it is zero
+        je      L40                         ; skip if sample was zero
+        mov     edx, eax                    ; move to edx and apply rounding
+        shr     eax, 9
+        add     edx, eax
+        bsr     ecx, edx                    ; ecx = MSB set in sample (0 - 31)
+        lea     eax, [ecx+1]                ; eax = number used bits in sample (1 - 32)
+        sub     ecx, 8                      ; ecx = shift right amount (-8 to 23)
+        ror     edx, cl                     ; use rotate to do "signed" shift
+        sal     eax, 8                      ; move nbits to integer portion of log
+        movzx   edx, dl                     ; dl = mantissa, look up log fraction in table
+        mov     al, BYTE PTR [ebp+edx]      ; eax = combined integer and fraction for full log
+        add     edi, eax                    ; add to running sum
+        cmp     eax, [esp+28]               ; compare this sample's log (not the sum) to limit
+        jge     limit_exceeded
+L40:    sub     ebx, 1                      ; loop back if more samples
+        jne     limit_loop
+        jmp     normal_exit
+
+        align  64
+
+no_limit_loop:
+        mov     eax, [esi]                  ; get next sample into eax
+        cdq                                 ; edx = sign of sample (for abs)
+        add     esi, 4
+        xor     eax, edx
+        sub     eax, edx                    ; eax = abs (sample), ZF set if it is zero
+        je      L45                         ; skip if sample was zero
+        mov     edx, eax                    ; move to edx and apply rounding
+        shr     eax, 9
+        add     edx, eax
+        bsr     ecx, edx                    ; ecx = MSB set in sample (0 - 31)
+        lea     eax, [ecx+1]                ; eax = number used bits in sample (1 - 32)
+        sub     ecx, 8                      ; ecx = shift right amount (-8 to 23)
+        ror     edx, cl                     ; use rotate to do "signed" shift
+        sal     eax, 8                      ; move nbits to integer portion of log
+        movzx   edx, dl                     ; dl = mantissa, look up log fraction in table
+        mov     al, BYTE PTR [ebp+edx]      ; eax = combined integer and fraction for full log
+        add     edi, eax                    ; add to running sum
+L45:    sub     ebx, 1                      ; loop back if more samples
+        jne     no_limit_loop
+        jmp     normal_exit
+
+limit_exceeded:
+        mov     edi, -1                     ; -1 return means log limit exceeded
+normal_exit:
+        mov     eax, edi                    ; move sum accumulator into eax for return
+        pop     edi
+        pop     esi
+        pop     ebx
+        pop     ebp
+        ret
+
+; Helper function to determine if specified CPU feature is available (used here for MMX).
+; Input parameter is index of feature to be checked (EDX from CPUID(1) only, MMX = 23).
+; Return value is the specified bit (0 or 1) or 0 if CPUID is not supported.
+
+_pack_cpu_has_feature_x86:
+        pushfd                              ; save eflags
+        pushfd                              ; push another copy
+        xor     dword ptr [esp], 200000h    ; toggle ID bit on stack & pop it back into eflags
+        popfd
+        pushfd                              ; store possibly modified eflags
+        pop     eax                         ; and pop back into eax
+        xor     eax, [esp]                  ; compare to original pushed eflags
+        popfd                               ; restore original eflags
+        and     eax, 200000h                ; eax is nonzero if eflags ID bit was changeable
+        jz      oldcpu                      ; return zero if CPUID is not available (wow!)
+
+        push    ebx                         ; we must save ebx
+        mov     eax, 1                      ; do cpuid (1) to get features into edx
+        cpuid
+        mov     eax, edx                    ; copy into eax for shift
+        mov     cl, [esp+8]                 ; get parameter and shift that bit index into LSB
+        sar     eax, cl
+        and     eax, 1                      ; keep only the requested feature bit
+        pop     ebx                         ; restore ebx and return 0 or 1
+
+oldcpu: ret                                 ; return value in eax
+
+asmcode ends
+
+        end
+
diff --git a/third_party/wavpack/src/read_words.c b/third_party/wavpack/src/read_words.c
new file mode 100644
index 0000000..bbe2db5
--- /dev/null
+++ b/third_party/wavpack/src/read_words.c
@@ -0,0 +1,614 @@
+////////////////////////////////////////////////////////////////////////////
+//                           **** WAVPACK ****                            //
+//                  Hybrid Lossless Wavefile Compressor                   //
+//              Copyright (c) 1998 - 2013 Conifer Software.               //
+//                          All Rights Reserved.                          //
+//      Distributed under the BSD Software License (see license.txt)      //
+////////////////////////////////////////////////////////////////////////////
+
+// read_words.c
+
+// This module provides entropy word decoding functions using
+// a variation on the Rice method.  This was introduced in version 3.93
+// because it allows splitting the data into a "lossy" stream and a
+// "correction" stream in a very efficient manner and is therefore ideal
+// for the "hybrid" mode.  For 4.0, the efficiency of this method was
+// significantly improved by moving away from the normal Rice restriction of
+// using powers of two for the modulus divisions and now the method can be
+// used for both hybrid and pure lossless encoding.
+
+// Samples are divided by median probabilities at 5/7 (71.43%), 10/49 (20.41%),
+// and 20/343 (5.83%). Each zone has 3.5 times fewer samples than the
+// previous. Using standard Rice coding on this data would result in 1.4
+// bits per sample average (not counting sign bit). However, there is a
+// very simple encoding that is over 99% efficient with this data and
+// results in about 1.22 bits per sample.
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "wavpack_local.h"
+
+#if defined (HAVE___BUILTIN_CTZ) || defined (_WIN64)
+#define USE_CTZ_OPTIMIZATION    // use ctz intrinsic (or Windows equivalent) to count trailing ones
+#else
+#define USE_NEXT8_OPTIMIZATION  // optimization using a table to count trailing ones
+#endif
+
+#define USE_BITMASK_TABLES      // use tables instead of shifting for certain masking operations
+
+///////////////////////////// local table storage ////////////////////////////
+
+#ifdef USE_NEXT8_OPTIMIZATION
+static const char ones_count_table [] = {   // [b] = number of consecutive 1 bits in byte b, counted from the LSB
+    0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,
+    0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,
+    0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,
+    0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,7,
+    0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,
+    0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,
+    0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,
+    0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,8
+};  // 256 entries; the callers handle byte 0xff themselves and only index this table for other values
+#endif
+
+///////////////////////////// executable code ////////////////////////////////
+
+static uint32_t __inline read_code (Bitstream *bs, uint32_t maxcode);
+
+// Read the next word from the bitstream "wvbits" and return the value. This
+// function can be used for hybrid or lossless streams, but since an
+// optimized version is available for lossless this function would normally
+// be used for hybrid only. If a hybrid lossless stream is being read then
+// the "correction" offset is written at the specified pointer. A return value
+// of WORD_EOF indicates that the end of the bitstream was reached (all 1s) or
+// some other error occurred.
+
+int32_t FASTCALL get_word (WavpackStream *wps, int chan, int32_t *correction)
+{
+    register struct entropy_data *c = wps->w.c + chan;     // entropy state of the requested channel
+    uint32_t ones_count, low, mid, high;
+    int32_t value;
+    int sign;
+
+    if (!wps->wvbits.ptr)                                   // no bitstream pointer: nothing to decode
+        return WORD_EOF;
+
+    if (correction)
+        *correction = 0;                                    // default if no correction value is read below
+
+    if (!(wps->w.c [0].median [0] & ~1) && !wps->w.holding_zero && !wps->w.holding_one && !(wps->w.c [1].median [0] & ~1)) {
+        uint32_t mask;                                      // zero-run handling: both median [0] values are 0 or 1, nothing held
+        int cbits;
+
+        if (wps->w.zeros_acc) {
+            if (--wps->w.zeros_acc) {                       // still inside a previously decoded run of zeros
+                c->slow_level -= (c->slow_level + SLO) >> SLS;
+                return 0;
+            }
+        }
+        else {
+            for (cbits = 0; cbits < 33 && getbit (&wps->wvbits); ++cbits);     // count consecutive 1 bits (at most 33)
+
+            if (cbits == 33)                                // 33 ones in a row: report end of data / error
+                return WORD_EOF;
+
+            if (cbits < 2)
+                wps->w.zeros_acc = cbits;                   // counts 0 and 1 are used directly
+            else {
+                for (mask = 1, wps->w.zeros_acc = 0; --cbits; mask <<= 1)      // read cbits - 1 bits, least significant first
+                    if (getbit (&wps->wvbits))
+                        wps->w.zeros_acc |= mask;
+
+                wps->w.zeros_acc |= mask;                   // then set the implied top bit
+            }
+
+            if (wps->w.zeros_acc) {                         // a new run of zeros begins: CLEAR both median sets, return first zero
+                c->slow_level -= (c->slow_level + SLO) >> SLS;
+                CLEAR (wps->w.c [0].median);
+                CLEAR (wps->w.c [1].median);
+                return 0;
+            }
+        }
+    }
+
+    if (wps->w.holding_zero)
+        ones_count = wps->w.holding_zero = 0;               // a held zero is used instead of reading a new ones count
+    else {
+#ifdef USE_CTZ_OPTIMIZATION
+        while (wps->wvbits.bc < LIMIT_ONES) {               // refill the shift register to at least LIMIT_ONES bits
+            if (++(wps->wvbits.ptr) == wps->wvbits.end)
+                wps->wvbits.wrap (&wps->wvbits);
+
+            wps->wvbits.sr |= *(wps->wvbits.ptr) << wps->wvbits.bc;
+            wps->wvbits.bc += sizeof (*(wps->wvbits.ptr)) * 8;
+        }
+
+#ifdef _WIN32
+        { unsigned long res; _BitScanForward (&res, (unsigned long)~wps->wvbits.sr); ones_count = (uint32_t) res; }
+#else
+        ones_count = __builtin_ctz (~wps->wvbits.sr);       // trailing zeros of ~sr == trailing ones of sr
+#endif
+
+        if (ones_count >= LIMIT_ONES) {
+            wps->wvbits.bc -= ones_count;                   // consume the ones found so far
+            wps->wvbits.sr >>= ones_count;
+
+            for (; ones_count < (LIMIT_ONES + 1) && getbit (&wps->wvbits); ++ones_count);     // continue counting with getbit
+
+            if (ones_count == (LIMIT_ONES + 1))
+                return WORD_EOF;
+
+            if (ones_count == LIMIT_ONES) {                 // escape: the remainder of the count is coded separately
+                uint32_t mask;
+                int cbits;
+
+                for (cbits = 0; cbits < 33 && getbit (&wps->wvbits); ++cbits);
+
+                if (cbits == 33)
+                    return WORD_EOF;
+
+                if (cbits < 2)
+                    ones_count = cbits;
+                else {
+                    for (mask = 1, ones_count = 0; --cbits; mask <<= 1)    // same bit layout as the zero-run count above
+                        if (getbit (&wps->wvbits))
+                            ones_count |= mask;
+
+                    ones_count |= mask;
+                }
+
+                ones_count += LIMIT_ONES;                   // decoded value is an offset from LIMIT_ONES
+            }
+        }
+        else {
+            wps->wvbits.bc -= ones_count + 1;               // consume the ones plus the zero bit that ended them
+            wps->wvbits.sr >>= ones_count + 1;
+        }
+#elif defined (USE_NEXT8_OPTIMIZATION)
+        int next8;
+
+        if (wps->wvbits.bc < 8) {                           // fewer than 8 bits buffered: fetch the next word first
+            if (++(wps->wvbits.ptr) == wps->wvbits.end)
+                wps->wvbits.wrap (&wps->wvbits);
+
+            next8 = (wps->wvbits.sr |= *(wps->wvbits.ptr) << wps->wvbits.bc) & 0xff;
+            wps->wvbits.bc += sizeof (*(wps->wvbits.ptr)) * 8;
+        }
+        else
+            next8 = wps->wvbits.sr & 0xff;
+
+        if (next8 == 0xff) {                                // all 8 bits are ones: consume them and keep counting bit by bit
+            wps->wvbits.bc -= 8;
+            wps->wvbits.sr >>= 8;
+
+            for (ones_count = 8; ones_count < (LIMIT_ONES + 1) && getbit (&wps->wvbits); ++ones_count);
+
+            if (ones_count == (LIMIT_ONES + 1))
+                return WORD_EOF;
+
+            if (ones_count == LIMIT_ONES) {                 // escape: the remainder of the count is coded separately
+                uint32_t mask;
+                int cbits;
+
+                for (cbits = 0; cbits < 33 && getbit (&wps->wvbits); ++cbits);
+
+                if (cbits == 33)
+                    return WORD_EOF;
+
+                if (cbits < 2)
+                    ones_count = cbits;
+                else {
+                    for (mask = 1, ones_count = 0; --cbits; mask <<= 1)
+                        if (getbit (&wps->wvbits))
+                            ones_count |= mask;
+
+                    ones_count |= mask;
+                }
+
+                ones_count += LIMIT_ONES;
+            }
+        }
+        else {
+            wps->wvbits.bc -= (ones_count = ones_count_table [next8]) + 1;     // table lookup; + 1 also consumes the zero bit
+            wps->wvbits.sr >>= ones_count + 1;
+        }
+#else
+        for (ones_count = 0; ones_count < (LIMIT_ONES + 1) && getbit (&wps->wvbits); ++ones_count);   // plain bit-by-bit count
+
+        if (ones_count >= LIMIT_ONES) {
+            uint32_t mask;
+            int cbits;
+
+            if (ones_count == (LIMIT_ONES + 1))
+                return WORD_EOF;
+
+            for (cbits = 0; cbits < 33 && getbit (&wps->wvbits); ++cbits);
+
+            if (cbits == 33)
+                return WORD_EOF;
+
+            if (cbits < 2)
+                ones_count = cbits;
+            else {
+                for (mask = 1, ones_count = 0; --cbits; mask <<= 1)
+                    if (getbit (&wps->wvbits))
+                        ones_count |= mask;
+
+                ones_count |= mask;
+            }
+
+            ones_count += LIMIT_ONES;
+        }
+#endif
+
+        if (wps->w.holding_one) {
+            wps->w.holding_one = ones_count & 1;            // low bit of the count is carried to the next call
+            ones_count = (ones_count >> 1) + 1;             // the previously held one adds 1 to this count
+        }
+        else {
+            wps->w.holding_one = ones_count & 1;
+            ones_count >>= 1;
+        }
+
+        wps->w.holding_zero = ~wps->w.holding_one & 1;      // always the complement of holding_one here
+    }
+
+    if ((wps->wphdr.flags & HYBRID_FLAG) && !chan)
+        update_error_limit (wps);                           // hybrid mode: refresh the error limit(s) when decoding channel 0
+
+    if (ones_count == 0) {                                  // range is [0, med0 - 1]
+        low = 0;
+        high = GET_MED (0) - 1;
+        DEC_MED0 ();
+    }
+    else {
+        low = GET_MED (0);
+        INC_MED0 ();
+
+        if (ones_count == 1) {                              // range is [med0, med0 + med1 - 1]
+            high = low + GET_MED (1) - 1;
+            DEC_MED1 ();
+        }
+        else {
+            low += GET_MED (1);
+            INC_MED1 ();
+
+            if (ones_count == 2) {                          // range is [med0 + med1, med0 + med1 + med2 - 1]
+                high = low + GET_MED (2) - 1;
+                DEC_MED2 ();
+            }
+            else {
+                low += (ones_count - 2) * GET_MED (2);      // every further one moves the range up by med2
+                high = low + GET_MED (2) - 1;
+                INC_MED2 ();
+            }
+        }
+    }
+
+    low &= 0x7fffffff;                                      // confine both bounds to 31 bits
+    high &= 0x7fffffff;
+
+    if (low > high)         // make sure high and low make sense
+        high = low;
+
+    mid = (high + low + 1) >> 1;                            // midpoint of [low, high], rounded up
+
+    if (!c->error_limit)
+        mid = read_code (&wps->wvbits, high - low) + low;   // no error limit: read the exact offset within the range
+    else while (high - low > c->error_limit) {              // otherwise halve the range, one bit per step, until within the limit
+        if (getbit (&wps->wvbits))
+            mid = (high + (low = mid) + 1) >> 1;            // bit 1: keep [mid, high]
+        else
+            mid = ((high = mid - 1) + low + 1) >> 1;        // bit 0: keep [low, mid - 1]
+    }
+
+    sign = getbit (&wps->wvbits);
+
+    if (bs_is_open (&wps->wvcbits) && c->error_limit) {     // correction bitstream open: read the exact value from it
+        value = read_code (&wps->wvcbits, high - low) + low;
+
+        if (correction)
+            *correction = sign ? (mid - value) : (value - mid);
+    }
+
+    if (wps->wphdr.flags & HYBRID_BITRATE) {
+        c->slow_level -= (c->slow_level + SLO) >> SLS;
+        c->slow_level += wp_log2 (mid);
+    }
+
+    return sign ? ~mid : mid;                               // sign bit set: return the bitwise complement of mid
+}
+
+// This is an optimized version of get_word() that is used for lossless only
+// (error_limit == 0). Also, rather than obtaining a single sample, it can be
+// used to obtain an entire buffer of either mono or stereo samples.
+
+int32_t get_words_lossless (WavpackStream *wps, int32_t *buffer, int32_t nsamples)
+{
+    struct entropy_data *c = wps->w.c;          // channel 0 state; re-selected per value below when not MONO_DATA
+    uint32_t ones_count, low, high;
+    Bitstream *bs = &wps->wvbits;
+    int32_t csamples;
+#ifdef USE_NEXT8_OPTIMIZATION
+    int32_t next8;
+#endif
+
+    if (nsamples && !bs->ptr) {                 // no bitstream pointer: fill the buffer with zeros and report all samples
+        memset (buffer, 0, (wps->wphdr.flags & MONO_DATA) ? nsamples * 4 : nsamples * 8);
+        return nsamples;
+    }
+
+    if (!(wps->wphdr.flags & MONO_DATA))
+        nsamples *= 2;                          // from here on nsamples counts individual buffer entries
+
+    for (csamples = 0; csamples < nsamples; ++csamples) {
+        if (!(wps->wphdr.flags & MONO_DATA))
+            c = wps->w.c + (csamples & 1);      // even entries use c [0], odd entries use c [1]
+
+        if (wps->w.holding_zero) {              // held zero: value lies in [0, med0 - 1], no ones count is read
+            wps->w.holding_zero = 0;
+            low = read_code (bs, GET_MED (0) - 1);
+            DEC_MED0 ();
+            buffer [csamples] = (getbit (bs)) ? ~low : low;
+
+            if (++csamples == nsamples)
+                break;
+
+            if (!(wps->wphdr.flags & MONO_DATA))
+                c = wps->w.c + (csamples & 1);  // re-select the channel for the entry just advanced to
+        }
+
+        if (wps->w.c [0].median [0] < 2 && !wps->w.holding_one && wps->w.c [1].median [0] < 2) {
+            uint32_t mask;                      // zero-run handling, as in get_word ()
+            int cbits;
+
+            if (wps->w.zeros_acc) {
+                if (--wps->w.zeros_acc) {       // still inside a previously decoded run of zeros
+                    buffer [csamples] = 0;
+                    continue;
+                }
+            }
+            else {
+                for (cbits = 0; cbits < 33 && getbit (bs); ++cbits);   // count consecutive 1 bits (at most 33)
+
+                if (cbits == 33)                // 33 ones in a row: stop decoding
+                    break;
+
+                if (cbits < 2)
+                    wps->w.zeros_acc = cbits;
+                else {
+                    for (mask = 1, wps->w.zeros_acc = 0; --cbits; mask <<= 1)      // read cbits - 1 bits, least significant first
+                        if (getbit (bs))
+                            wps->w.zeros_acc |= mask;
+
+                    wps->w.zeros_acc |= mask;   // then set the implied top bit
+                }
+
+                if (wps->w.zeros_acc) {         // a new run of zeros begins
+                    CLEAR (wps->w.c [0].median);
+                    CLEAR (wps->w.c [1].median);
+                    buffer [csamples] = 0;
+                    continue;
+                }
+            }
+        }
+
+#ifdef USE_CTZ_OPTIMIZATION
+        while (bs->bc < LIMIT_ONES) {           // refill the shift register to at least LIMIT_ONES bits
+            if (++(bs->ptr) == bs->end)
+                bs->wrap (bs);
+
+            bs->sr |= *(bs->ptr) << bs->bc;
+            bs->bc += sizeof (*(bs->ptr)) * 8;
+        }
+
+#ifdef _WIN32
+        { unsigned long res; _BitScanForward (&res, (unsigned long)~wps->wvbits.sr); ones_count = (uint32_t) res; }
+#else
+        ones_count = __builtin_ctz (~wps->wvbits.sr);       // wps->wvbits is the same object as *bs
+#endif
+
+        if (ones_count >= LIMIT_ONES) {
+            bs->bc -= ones_count;               // consume the ones found so far
+            bs->sr >>= ones_count;
+
+            for (; ones_count < (LIMIT_ONES + 1) && getbit (bs); ++ones_count);    // continue counting with getbit
+
+            if (ones_count == (LIMIT_ONES + 1))
+                break;
+
+            if (ones_count == LIMIT_ONES) {     // escape: the remainder of the count is coded separately
+                uint32_t mask;
+                int cbits;
+
+                for (cbits = 0; cbits < 33 && getbit (bs); ++cbits);
+
+                if (cbits == 33)
+                    break;
+
+                if (cbits < 2)
+                    ones_count = cbits;
+                else {
+                    for (mask = 1, ones_count = 0; --cbits; mask <<= 1)
+                        if (getbit (bs))
+                            ones_count |= mask;
+
+                    ones_count |= mask;
+                }
+
+                ones_count += LIMIT_ONES;
+            }
+        }
+        else {
+            bs->bc -= ones_count + 1;           // consume the ones plus the zero bit that ended them
+            bs->sr >>= ones_count + 1;
+        }
+#elif defined (USE_NEXT8_OPTIMIZATION)
+        if (bs->bc < 8) {                       // fewer than 8 bits buffered: fetch the next word first
+            if (++(bs->ptr) == bs->end)
+                bs->wrap (bs);
+
+            next8 = (bs->sr |= *(bs->ptr) << bs->bc) & 0xff;
+            bs->bc += sizeof (*(bs->ptr)) * 8;
+        }
+        else
+            next8 = bs->sr & 0xff;
+
+        if (next8 == 0xff) {                    // all 8 bits are ones: consume them and keep counting bit by bit
+            bs->bc -= 8;
+            bs->sr >>= 8;
+
+            for (ones_count = 8; ones_count < (LIMIT_ONES + 1) && getbit (bs); ++ones_count);
+
+            if (ones_count == (LIMIT_ONES + 1))
+                break;
+
+            if (ones_count == LIMIT_ONES) {     // escape: the remainder of the count is coded separately
+                uint32_t mask;
+                int cbits;
+
+                for (cbits = 0; cbits < 33 && getbit (bs); ++cbits);
+
+                if (cbits == 33)
+                    break;
+
+                if (cbits < 2)
+                    ones_count = cbits;
+                else {
+                    for (mask = 1, ones_count = 0; --cbits; mask <<= 1)
+                        if (getbit (bs))
+                            ones_count |= mask;
+
+                    ones_count |= mask;
+                }
+
+                ones_count += LIMIT_ONES;
+            }
+        }
+        else {
+            bs->bc -= (ones_count = ones_count_table [next8]) + 1;     // table lookup; + 1 also consumes the zero bit
+            bs->sr >>= ones_count + 1;
+        }
+#else
+        for (ones_count = 0; ones_count < (LIMIT_ONES + 1) && getbit (bs); ++ones_count);     // plain bit-by-bit count
+
+        if (ones_count >= LIMIT_ONES) {
+            uint32_t mask;
+            int cbits;
+
+            if (ones_count == (LIMIT_ONES + 1))
+                break;
+
+            for (cbits = 0; cbits < 33 && getbit (bs); ++cbits);
+
+            if (cbits == 33)
+                break;
+
+            if (cbits < 2)
+                ones_count = cbits;
+            else {
+                for (mask = 1, ones_count = 0; --cbits; mask <<= 1)
+                    if (getbit (bs))
+                        ones_count |= mask;
+
+                ones_count |= mask;
+            }
+
+            ones_count += LIMIT_ONES;
+        }
+#endif
+
+        low = wps->w.holding_one;               // low is used as a temporary for the previously held one
+        wps->w.holding_one = ones_count & 1;
+        wps->w.holding_zero = ~ones_count & 1;
+        ones_count = (ones_count >> 1) + low;
+
+        if (ones_count == 0) {                  // range is [0, med0 - 1]
+            low = 0;
+            high = GET_MED (0) - 1;
+            DEC_MED0 ();
+        }
+        else {
+            low = GET_MED (0);
+            INC_MED0 ();
+
+            if (ones_count == 1) {              // range is [med0, med0 + med1 - 1]
+                high = low + GET_MED (1) - 1;
+                DEC_MED1 ();
+            }
+            else {
+                low += GET_MED (1);
+                INC_MED1 ();
+
+                if (ones_count == 2) {          // range is [med0 + med1, med0 + med1 + med2 - 1]
+                    high = low + GET_MED (2) - 1;
+                    DEC_MED2 ();
+                }
+                else {
+                    low += (ones_count - 2) * GET_MED (2);  // every further one moves the range up by med2
+                    high = low + GET_MED (2) - 1;
+                    INC_MED2 ();
+                }
+            }
+        }
+
+        low += read_code (bs, high - low);      // exact offset within [low, high]
+        buffer [csamples] = (getbit (bs)) ? ~low : low;     // sign bit set: store the bitwise complement
+    }
+
+    return (wps->wphdr.flags & MONO_DATA) ? csamples : (csamples / 2);  // entries written, halved when not MONO_DATA
+}
+
+// Read a single unsigned value from the specified bitstream with a value
+// from 0 to maxcode. If there are exactly a power of two number of possible
+// codes then this will read a fixed number of bits; otherwise it reads the
+// minimum number of bits and then determines whether another bit is needed
+// to define the code.
+
+static uint32_t __inline read_code (Bitstream *bs, uint32_t maxcode)
+{
+    unsigned long local_sr;                     // working copy of the shift register (its width is platform dependent)
+    uint32_t extras, code;
+    int bitcount;
+
+    if (maxcode < 2)
+        return maxcode ? getbit (bs) : 0;       // maxcode 0 consumes no bits; maxcode 1 is a single raw bit
+
+    bitcount = count_bits (maxcode);
+#ifdef USE_BITMASK_TABLES
+    extras = bitset [bitcount] - maxcode - 1;   // presumably the table equivalent of the shift form below
+#else
+    extras = (1 << bitcount) - maxcode - 1;     // number of codes that fit in bitcount - 1 bits
+#endif
+
+    local_sr = bs->sr;
+
+    while (bs->bc < bitcount) {                 // fetch words until at least bitcount bits are buffered
+        if (++(bs->ptr) == bs->end)
+            bs->wrap (bs);
+
+        local_sr |= (long)*(bs->ptr) << bs->bc;
+        bs->bc += sizeof (*(bs->ptr)) * 8;
+    }
+
+#ifdef USE_BITMASK_TABLES
+    if ((code = local_sr & bitmask [bitcount - 1]) >= extras)
+#else
+    if ((code = local_sr & ((1 << (bitcount - 1)) - 1)) >= extras)
+#endif
+        code = (code << 1) - extras + ((local_sr >> (bitcount - 1)) & 1);   // long code: one additional bit is used
+    else
+        bitcount--;                             // short code: only bitcount - 1 bits are consumed
+
+    if (sizeof (local_sr) < 8 && bs->bc > sizeof (local_sr) * 8) {  // more bits buffered than a sub-64-bit local_sr can hold
+        bs->bc -= bitcount;
+        bs->sr = *(bs->ptr) >> (sizeof (*(bs->ptr)) * 8 - bs->bc);  // rebuild sr from the last word fetched
+    }
+    else {
+        bs->bc -= bitcount;
+        bs->sr = local_sr >> bitcount;          // drop the consumed bits
+    }
+
+    return code;
+}
diff --git a/third_party/wavpack/src/tag_utils.c b/third_party/wavpack/src/tag_utils.c
new file mode 100644
index 0000000..f98e1dd
--- /dev/null
+++ b/third_party/wavpack/src/tag_utils.c
@@ -0,0 +1,597 @@
+////////////////////////////////////////////////////////////////////////////
+//                           **** WAVPACK ****                            //
+//                  Hybrid Lossless Wavefile Compressor                   //
+//              Copyright (c) 1998 - 2013 Conifer Software.               //
+//                          All Rights Reserved.                          //
+//      Distributed under the BSD Software License (see license.txt)      //
+////////////////////////////////////////////////////////////////////////////
+
+// tag_utils.c
+
+// This module provides the high-level API for creating, reading and editing
+// APEv2 tags on WavPack files. Read-only support is also provided for ID3v1
+// tags, but their use is not recommended.
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "wavpack_local.h"
+
+#ifdef _WIN32
+#define stricmp(x,y) _stricmp(x,y)
+#else
+#define stricmp strcasecmp
+#endif
+
+static int get_ape_tag_item (M_Tag *m_tag, const char *item, char *value, int size, int type);
+static int get_id3_tag_item (M_Tag *m_tag, const char *item, char *value, int size);
+static int get_ape_tag_item_indexed (M_Tag *m_tag, int index, char *item, int size, int type);
+static int get_id3_tag_item_indexed (M_Tag *m_tag, int index, char *item, int size);
+static int append_ape_tag_item (WavpackContext *wpc, const char *item, const char *value, int vsize, int type);
+static int write_tag_blockout (WavpackContext *wpc);
+static int write_tag_reader (WavpackContext *wpc);
+static void tagcpy (char *dest, char *src, int tag_size);
+static int tagdata (char *src, int tag_size);
+
+//////////////////// Global functions part of external API /////////////////////////
+
+// Count and return the total number of tag items in the specified file.
+
+int WavpackGetNumTagItems (WavpackContext *wpc)
+{
+    int count;      // doubles as the index of the next item to probe
+
+    for (count = 0; WavpackGetTagItemIndexed (wpc, count, NULL, 0); ++count)
+        ;           // a zero return means there is no item at this index
+
+    return count;
+}
+
+// Count and return the total number of binary tag items in the specified file. This applies
+// only to APEv2 tags and was implemented as a separate function to avoid breaking the old API.
+
+int WavpackGetNumBinaryTagItems (WavpackContext *wpc)
+{
+    int count;      // doubles as the index of the next binary item to probe
+
+    for (count = 0; WavpackGetBinaryTagItemIndexed (wpc, count, NULL, 0); ++count)
+        ;           // a zero return ends the count
+
+    return count;
+}
+
+// Attempt to get the specified item from the specified file's ID3v1 or APEv2
+// tag. The "size" parameter specifies the amount of space available at "value",
+// if the desired item will not fit in this space then ellipses (...) will
+// be appended and the string terminated. Only text data are supported. The
+// actual length of the string is returned (or 0 if no matching value found).
+// Note that with APEv2 tags the length might not be the same as the number of
+// characters because UTF-8 encoding is used. Also, APEv2 tags can have multiple
+// (NULL separated) strings for a single value (this is why the length is
+// returned). If this function is called with a NULL "value" pointer (or a
+// zero "size") then only the actual length of the value data is returned
+// (not counting the terminating NUL). This can be used to determine the
+// actual memory to be allocated beforehand.
+
+int WavpackGetTagItem (WavpackContext *wpc, const char *item, char *value, int size)
+{
+    M_Tag *tag = &wpc->m_tag;
+
+    if (value != NULL && size != 0)
+        value [0] = 0;                          // start from an empty string in case nothing is found
+
+    if (tag->ape_tag_hdr.ID [0] == 'A')         // APE tag header is populated: look the item up there
+        return get_ape_tag_item (tag, item, value, size, APE_TAG_TYPE_TEXT);
+    if (tag->id3_tag.tag_id [0] == 'T')         // otherwise try the ID3 tag
+        return get_id3_tag_item (tag, item, value, size);
+
+    return 0;                                   // neither tag is present
+}
+
+// Attempt to get the specified binary item from the specified file's APEv2
+// tag. The "size" parameter specifies the amount of space available at "value".
+// If the desired item will not fit in this space then nothing will be copied
+// and 0 will be returned, otherwise the actual size will be returned. If this
+// function is called with a NULL "value" pointer (or a zero "size") then only
+// the actual length of the value data is returned and can be used to determine
+// the actual memory to be allocated beforehand.
+
+int WavpackGetBinaryTagItem (WavpackContext *wpc, const char *item, char *value, int size)
+{
+    M_Tag *tag = &wpc->m_tag;
+
+    if (value != NULL && size != 0)
+        value [0] = 0;                          // clear the first output byte up front
+
+    if (tag->ape_tag_hdr.ID [0] != 'A')         // binary items are only looked up in an APE tag
+        return 0;
+
+    return get_ape_tag_item (tag, item, value, size, APE_TAG_TYPE_BINARY);
+}
+
+// This function looks up the tag item name by index and is used when the
+// application wants to access all the items in the file's ID3v1 or APEv2 tag.
+// Note that this function accesses only the item's name; WavpackGetTagItem()
+// still must be called to get the actual value. The "size" parameter specifies
+// the amount of space available at "item", if the desired item will not fit in
+// this space then ellipses (...) will be appended and the string terminated.
+// The actual length of the string is returned (or 0 if no item exists for
+// index). If this function is called with a NULL "item" pointer (or a
+// zero "size") then only the actual length of the item name is returned
+// (not counting the terminating NUL). This can be used to determine the
+// actual memory to be allocated beforehand. For binary tag values use the
+// otherwise identical WavpackGetBinaryTagItemIndexed ().
+
+int WavpackGetTagItemIndexed (WavpackContext *wpc, int index, char *item, int size)
+{
+    M_Tag *tag = &wpc->m_tag;
+
+    if (item != NULL && size != 0)
+        item [0] = 0;                           // start from an empty name in case nothing is found
+
+    if (tag->ape_tag_hdr.ID [0] == 'A')         // APE tag header is populated: index into its text items
+        return get_ape_tag_item_indexed (tag, index, item, size, APE_TAG_TYPE_TEXT);
+    if (tag->id3_tag.tag_id [0] == 'T')         // otherwise try the ID3 tag
+        return get_id3_tag_item_indexed (tag, index, item, size);
+
+    return 0;                                   // neither tag is present
+}
+
+int WavpackGetBinaryTagItemIndexed (WavpackContext *wpc, int index, char *item, int size)
+{
+    M_Tag *tag = &wpc->m_tag;
+
+    if (item != NULL && size != 0)
+        item [0] = 0;                           // start from an empty name in case nothing is found
+
+    if (tag->ape_tag_hdr.ID [0] != 'A')         // binary items are only looked up in an APE tag
+        return 0;
+
+    return get_ape_tag_item_indexed (tag, index, item, size, APE_TAG_TYPE_BINARY);
+}
+
+// These two functions are used to append APEv2 tags to WavPack files; one is
+// for text values (UTF-8 encoded) and the other is for binary values. If no tag
+// has been started, then an empty one will be allocated first. When finished,
+// use WavpackWriteTag() to write the completed tag to the file. The purpose of
+// the passed size parameter is obvious for binary values, but might not be for
+// text values. Keep in mind that APEv2 text values can have multiple values
+// that are NULL separated, so the size is required to know the extent of the
+// value (although the final terminating NULL is not included in the passed
+// size). If the specified item already exists, it will be replaced with the
+// new value. ID3v1 tags are not supported.
+
+int WavpackAppendTagItem (WavpackContext *wpc, const char *item, const char *value, int vsize)
+{
+    while (WavpackDeleteTagItem (wpc, item) != 0) continue;    // first remove every existing item with this name
+    return append_ape_tag_item (wpc, item, value, vsize, APE_TAG_TYPE_TEXT);   // then add it as a text item
+}
+
+int WavpackAppendBinaryTagItem (WavpackContext *wpc, const char *item, const char *value, int vsize)
+{
+    while (WavpackDeleteTagItem (wpc, item) != 0) continue;    // first remove every existing item with this name
+    return append_ape_tag_item (wpc, item, value, vsize, APE_TAG_TYPE_BINARY); // then add it as a binary item
+}
+
+// Delete the specified tag item from the APEv2 tag on the specified WavPack file
+// (fields cannot be deleted from ID3v1 tags). A return value of TRUE indicates
+// that the item was found and successfully deleted.
+
+int WavpackDeleteTagItem (WavpackContext *wpc, const char *item)
+{
+    M_Tag *m_tag = &wpc->m_tag;
+
+    if (m_tag->ape_tag_hdr.ID [0] == 'A') {             // only an APE tag is searched; anything else returns 0
+        unsigned char *p = m_tag->ape_tag_data;         // cursor into the packed item data
+        unsigned char *q = p + m_tag->ape_tag_hdr.length - sizeof (APE_Tag_Hdr);   // end of the item data
+        int i;
+
+        for (i = 0; i < m_tag->ape_tag_hdr.item_count && q - p > 8; ++i) {     // stop once a full item header no longer fits
+            int vsize, isize;
+
+            vsize = p[0] + (p[1] << 8) + (p[2] << 16) + (p[3] << 24); p += 8;   // skip flags because we don't need them
+            for (isize = 0; p + isize < q && p[isize]; ++isize);    // bounds test first: never read p[isize] at or past q
+
+            if (vsize < 0 || vsize > m_tag->ape_tag_hdr.length || p + isize + vsize + 1 > q)
+                break;                                  // corrupt or truncated item: give up
+
+            if (isize && vsize && !stricmp (item, (char *) p)) {
+                unsigned char *d = p - 8;               // destination: start of this item's 8-byte header
+
+                p += isize + vsize + 1;                 // source: first byte after this item
+
+                while (p < q)                           // slide the remaining items down over the deleted one
+                    *d++ = *p++;
+
+                m_tag->ape_tag_hdr.length = (int32_t)(d - m_tag->ape_tag_data) + sizeof (APE_Tag_Hdr);
+                m_tag->ape_tag_hdr.item_count--;
+                return 1;                               // found and removed
+            }
+            else
+                p += isize + vsize + 1;                 // not a match: advance to the next item
+        }
+    }
+
+    return 0;                                           // no matching item
+}
+
+// Once an APEv2 tag has been created with WavpackAppendTagItem(), this function
+// is used to write the completed tag to the end of the WavPack file. Note that
+// this function uses the same "blockout" function that is used to write
+// regular WavPack blocks, although that's where the similarity ends. It is also
+// used to write tags that have been edited on existing files.
+
+int WavpackWriteTag (WavpackContext *wpc)
+{
+    // A blockout function is set when a fresh WavPack file is being created;
+    // without one we are editing the tags of an existing file (OPEN_EDIT_TAGS).
+
+    return wpc->blockout ? write_tag_blockout (wpc) : write_tag_reader (wpc);
+}
+
+////////////////////////// local static functions /////////////////////////////
+
+static int get_ape_tag_item (M_Tag *m_tag, const char *item, char *value, int size, int type)
+{   // returns the length of the named item's value (or the truncated length), 0 if not found or it cannot be stored
+    unsigned char *p = m_tag->ape_tag_data;     // cursor into the packed item data
+    unsigned char *q = p + m_tag->ape_tag_hdr.length - sizeof (APE_Tag_Hdr);   // end of the item data
+    int i;
+
+    for (i = 0; i < m_tag->ape_tag_hdr.item_count && q - p > 8; ++i) {
+        int vsize, flags, isize;
+
+        vsize = p[0] + (p[1] << 8) + (p[2] << 16) + (p[3] << 24); p += 4;      // little-endian value size
+        flags = p[0] + (p[1] << 8) + (p[2] << 16) + (p[3] << 24); p += 4;      // little-endian item flags
+        for (isize = 0; p + isize < q && p[isize]; ++isize);    // bounds test first: never read p[isize] at or past q
+
+        if (vsize < 0 || vsize > m_tag->ape_tag_hdr.length || p + isize + vsize + 1 > q)
+            break;                              // corrupt or truncated item: give up
+
+        if (isize && vsize && !stricmp (item, (char *) p) && ((flags & 6) >> 1) == type) {
+
+            if (!value || !size)
+                return vsize;                   // size query only
+
+            if (type == APE_TAG_TYPE_BINARY) {  // binary values are copied whole or not at all
+                if (vsize <= size) {
+                    memcpy (value, p + isize + 1, vsize);
+                    return vsize;
+                }
+                else
+                    return 0;
+            }
+            else if (vsize < size) {            // text fits together with its terminator
+                memcpy (value, p + isize + 1, vsize);
+                value [vsize] = 0;
+                return vsize;
+            }
+            else if (size >= 4) {               // text does not fit: truncate and end with "..."
+                memcpy (value, p + isize + 1, size - 1);
+                value [size - 4] = value [size - 3] = value [size - 2] = '.';
+                value [size - 1] = 0;
+                return size - 1;
+            }
+            else
+                return 0;                       // no room even for the ellipsis
+        }
+        else
+            p += isize + vsize + 1;             // skip key, its terminator and the value
+    }
+
+    return 0;
+}
+
+static int get_id3_tag_item (M_Tag *m_tag, const char *item, char *value, int size)
+{   // returns the length of the named ID3 field (or the truncated length), 0 if the name is unknown or it cannot be stored
+    char lvalue [64];       // local copy of the selected field
+    int len;
+
+    lvalue [0] = 0;
+
+    if (!stricmp (item, "title"))
+        tagcpy (lvalue, m_tag->id3_tag.title, sizeof (m_tag->id3_tag.title));
+    else if (!stricmp (item, "artist"))
+        tagcpy (lvalue, m_tag->id3_tag.artist, sizeof (m_tag->id3_tag.artist));
+    else if (!stricmp (item, "album"))
+        tagcpy (lvalue, m_tag->id3_tag.album, sizeof (m_tag->id3_tag.album));
+    else if (!stricmp (item, "year"))
+        tagcpy (lvalue, m_tag->id3_tag.year, sizeof (m_tag->id3_tag.year));
+    else if (!stricmp (item, "comment"))
+        tagcpy (lvalue, m_tag->id3_tag.comment, sizeof (m_tag->id3_tag.comment));
+    else if (!stricmp (item, "track") && m_tag->id3_tag.comment [29] && !m_tag->id3_tag.comment [28])
+        sprintf (lvalue, "%d", (unsigned char) m_tag->id3_tag.comment [29]);   // print as 0..255 even where plain char is signed
+    else
+        return 0;
+
+    len = (int) strlen (lvalue);
+
+    if (!value || !size)
+        return len;         // size query only
+
+    if (len < size) {       // fits together with its terminator
+        strcpy (value, lvalue);
+        return len;
+    }
+    else if (size >= 4) {   // does not fit: truncate and end with "..."
+        strncpy (value, lvalue, size - 1);
+        value [size - 4] = value [size - 3] = value [size - 2] = '.';
+        value [size - 1] = 0;
+        return size - 1;
+    }
+    else
+        return 0;           // no room even for the ellipsis
+}
+
+// Return the name of the "index"th (0-based) item of the given "type" in the
+// APEv2 tag, copied to "item" ("size" bytes available; a name that does not
+// fit is cut and ends in "..."). A NULL "item" or zero "size" only queries
+// the length. Returns the name length, or 0 if no such item can be found.
+static int get_ape_tag_item_indexed (M_Tag *m_tag, int index, char *item, int size, int type)
+{
+    unsigned char *p = m_tag->ape_tag_data;
+    unsigned char *q = p + m_tag->ape_tag_hdr.length - sizeof (APE_Tag_Hdr);
+    int i;
+
+    for (i = 0; i < m_tag->ape_tag_hdr.item_count && index >= 0 && q - p > 8; ++i) {
+        int vsize, flags, isize;
+        // the top byte is shifted as unsigned: "p[3] << 24" overflows a signed int once bit 7 is set
+        vsize = (int) (p[0] | (p[1] << 8) | (p[2] << 16) | ((unsigned int) p[3] << 24)); p += 4;
+        flags = (int) (p[0] | (p[1] << 8) | (p[2] << 16) | ((unsigned int) p[3] << 24)); p += 4;
+        for (isize = 0; p + isize < q && p[isize]; ++isize);    // test the bound before reading the byte
+
+        if (vsize < 0 || vsize > m_tag->ape_tag_hdr.length || p + isize + vsize + 1 > q)
+            break;
+
+        if (!isize || !vsize || ((flags & 6) >> 1) != type || index--) {
+            p += isize + vsize + 1;     // not the requested item, step over it
+            continue;
+        }
+        if (!item || !size)
+            return isize;
+
+        if (isize < size) {
+            memcpy (item, p, isize);
+            item [isize] = 0;
+            return isize;
+        }
+        else if (size < 4)
+            return 0;
+        memcpy (item, p, size - 1);
+        item [size - 4] = item [size - 3] = item [size - 2] = '.';
+        item [size - 1] = 0;
+        return size - 1;
+    }
+    return 0;
+}
+
+// Report the name of the "index"th (0-based) populated ID3v1 field, counting
+// only fields that hold data, in the order Title, Artist, Album, Year,
+// Comment, Track. The name goes to "item" ("size" bytes, ended with "..." if
+// cut); a NULL "item" or zero "size" just returns the name length.
+static int get_id3_tag_item_indexed (M_Tag *m_tag, int index, char *item, int size)
+{
+    const char *name;
+    int len;
+
+    if (tagdata (m_tag->id3_tag.title, sizeof (m_tag->id3_tag.title)) && !index--)
+        name = "Title";
+    else if (tagdata (m_tag->id3_tag.artist, sizeof (m_tag->id3_tag.artist)) && !index--)
+        name = "Artist";
+    else if (tagdata (m_tag->id3_tag.album, sizeof (m_tag->id3_tag.album)) && !index--)
+        name = "Album";
+    else if (tagdata (m_tag->id3_tag.year, sizeof (m_tag->id3_tag.year)) && !index--)
+        name = "Year";
+    else if (tagdata (m_tag->id3_tag.comment, sizeof (m_tag->id3_tag.comment)) && !index--)
+        name = "Comment";
+    else if (m_tag->id3_tag.comment [29] && !m_tag->id3_tag.comment [28] && !index--)
+        name = "Track";
+    else
+        return 0;       // no populated field at that index
+
+    len = (int) strlen (name);
+
+    if (!item || !size)
+        return len;
+
+    if (len >= size) {
+        if (size < 4)
+            return 0;
+        strncpy (item, name, size - 1);
+        item [size - 4] = item [size - 3] = item [size - 2] = '.';
+        item [size - 1] = 0;
+        return size - 1;
+    }
+    strcpy (item, name);
+    return len;
+}
+
+// Append one item ("item" is its name, "value" / "vsize" its data, "type" an
+// APE_TAG_TYPE_* code) to the in-memory APEv2 tag, first starting an empty tag
+// if none exists. Returns TRUE on success. Returns FALSE, leaving existing
+// items intact, if the tag is not APEv2, is too large, or cannot be grown.
+static int append_ape_tag_item (WavpackContext *wpc, const char *item, const char *value, int vsize, int type)
+{
+    M_Tag *m_tag = &wpc->m_tag;
+    int isize = (int) strlen (item);
+    int new_item_len = vsize + isize + 9, flags = type << 1, i;
+    unsigned char *p;
+
+    if (!m_tag->ape_tag_hdr.ID [0]) {
+        strncpy (m_tag->ape_tag_hdr.ID, "APETAGEX", sizeof (m_tag->ape_tag_hdr.ID));
+        m_tag->ape_tag_hdr.version = 2000;
+        m_tag->ape_tag_hdr.length = sizeof (m_tag->ape_tag_hdr);
+        m_tag->ape_tag_hdr.item_count = 0;
+        m_tag->ape_tag_hdr.flags = APE_TAG_CONTAINS_HEADER;  // we will include header on tags we originate
+    }
+
+    if (m_tag->ape_tag_hdr.ID [0] != 'A' || vsize < 0)      // not an APEv2 tag, or a nonsensical size
+        return FALSE;
+
+    if (m_tag->ape_tag_hdr.length + new_item_len > APE_TAG_MAX_LENGTH) {
+        strcpy (wpc->error_message, "APEv2 tag exceeds maximum allowed length!");
+        return FALSE;
+    }
+
+    // grow via a temporary so a failed realloc() cannot lose the old buffer or leave the header out of step
+    p = (unsigned char*)realloc (m_tag->ape_tag_data, m_tag->ape_tag_hdr.length + new_item_len);
+    if (!p) {
+        strcpy (wpc->error_message, "can't allocate memory for APEv2 tag!");
+        return FALSE;
+    }
+    m_tag->ape_tag_data = p;
+    p += m_tag->ape_tag_hdr.length - sizeof (APE_Tag_Hdr);     // first byte after the existing items
+    m_tag->ape_tag_hdr.length += new_item_len;
+    m_tag->ape_tag_hdr.item_count++;
+
+    for (i = 0; i < 4; ++i) {       // item header: value size then flags, both 32-bit little-endian
+        p [i] = (unsigned char) (vsize >> (i * 8));
+        p [i + 4] = (unsigned char) (flags >> (i * 8));
+    }
+    strcpy ((char *) p + 8, item);
+    memcpy (p + 8 + isize + 1, value, vsize);
+    return TRUE;
+}
+
+// Append the stored APEv2 tag to the file being created using the "blockout" function callback.
+// The optional header, the item data and the footer are written in that order; once a write
+// fails nothing further is written. Returns a true value on success or when there is no tag.
+
+static int write_tag_blockout (WavpackContext *wpc)
+{
+    M_Tag *m_tag = &wpc->m_tag;
+    int result = TRUE;
+
+    if (m_tag->ape_tag_hdr.ID [0] == 'A' && m_tag->ape_tag_hdr.item_count) {
+        // only write header if it's specified in the flags
+        if (m_tag->ape_tag_hdr.flags & APE_TAG_CONTAINS_HEADER) {
+            m_tag->ape_tag_hdr.flags |= APE_TAG_THIS_IS_HEADER;
+            WavpackNativeToLittleEndian (&m_tag->ape_tag_hdr, APE_Tag_Hdr_Format);
+            result = wpc->blockout (wpc->wv_out, &m_tag->ape_tag_hdr, sizeof (m_tag->ape_tag_hdr));
+            WavpackLittleEndianToNative (&m_tag->ape_tag_hdr, APE_Tag_Hdr_Format);
+        }
+
+        if (result && m_tag->ape_tag_hdr.length > sizeof (m_tag->ape_tag_hdr))     // keep an earlier failure visible
+            result = wpc->blockout (wpc->wv_out, m_tag->ape_tag_data, m_tag->ape_tag_hdr.length - sizeof (m_tag->ape_tag_hdr));
+
+        m_tag->ape_tag_hdr.flags &= ~APE_TAG_THIS_IS_HEADER;    // this is NOT header
+        WavpackNativeToLittleEndian (&m_tag->ape_tag_hdr, APE_Tag_Hdr_Format);
+        if (result)
+            result = wpc->blockout (wpc->wv_out, &m_tag->ape_tag_hdr, sizeof (m_tag->ape_tag_hdr));
+        WavpackLittleEndianToNative (&m_tag->ape_tag_hdr, APE_Tag_Hdr_Format);
+    }
+
+    if (!result)
+        strcpy (wpc->error_message, "can't write WavPack data, disk probably full!");
+    return result;
+}
+
+// Write the [potentially] edited tag to the existing WavPack file using the reader callback functions.
+// When the new tag is shorter than -tag_file_pos, the difference is written as nulls ahead of the tag or,
+// if the reader provides truncate_here, cut off after it. Returns TRUE on success, FALSE (message set) if not.
+
+static int write_tag_reader (WavpackContext *wpc)
+{
+    M_Tag *m_tag = &wpc->m_tag;
+    int32_t tag_size = 0;
+    int result;
+
+    // before we write an edited (or new) tag into an existing file, make sure it's safe and possible
+    if (m_tag->tag_begins_file) {
+        strcpy (wpc->error_message, "can't edit tags located at the beginning of files!");
+        return FALSE;
+    }
+
+    if (!wpc->reader->can_seek (wpc->wv_in)) {
+        strcpy (wpc->error_message, "can't edit tags on pipes or unseekable files!");
+        return FALSE;
+    }
+
+    if (!(wpc->open_flags & OPEN_EDIT_TAGS)) {
+        strcpy (wpc->error_message, "can't edit tags without OPEN_EDIT_TAGS flag!");
+        return FALSE;
+    }
+
+    if (m_tag->ape_tag_hdr.ID [0] == 'A' && m_tag->ape_tag_hdr.item_count &&
+        m_tag->ape_tag_hdr.length > sizeof (m_tag->ape_tag_hdr))
+            tag_size = m_tag->ape_tag_hdr.length;
+
+    // only write header if it's specified in the flags
+    if (tag_size && (m_tag->ape_tag_hdr.flags & APE_TAG_CONTAINS_HEADER))
+        tag_size += sizeof (m_tag->ape_tag_hdr);
+
+    result = !wpc->reader->set_pos_rel (wpc->wv_in, m_tag->tag_file_pos, SEEK_END);
+
+    if (result && tag_size < -m_tag->tag_file_pos && !wpc->reader->truncate_here) {
+        int nullcnt = (int) (-m_tag->tag_file_pos - tag_size);
+        char zero [1] = { 0 };
+        while (result && nullcnt--)     // stop padding (and skip the tag) as soon as a write fails
+            result = (wpc->reader->write_bytes (wpc->wv_in, &zero, 1) == 1);
+    }
+
+    if (result && tag_size) {
+        if (m_tag->ape_tag_hdr.flags & APE_TAG_CONTAINS_HEADER) {
+            m_tag->ape_tag_hdr.flags |= APE_TAG_THIS_IS_HEADER;
+            WavpackNativeToLittleEndian (&m_tag->ape_tag_hdr, APE_Tag_Hdr_Format);
+            result = (wpc->reader->write_bytes (wpc->wv_in, &m_tag->ape_tag_hdr, sizeof (m_tag->ape_tag_hdr)) == sizeof (m_tag->ape_tag_hdr));
+            WavpackLittleEndianToNative (&m_tag->ape_tag_hdr, APE_Tag_Hdr_Format);
+        }
+
+        if (result)     // later writes run only while earlier ones succeeded; expect the data length, not the header size
+            result = (wpc->reader->write_bytes (wpc->wv_in, m_tag->ape_tag_data, m_tag->ape_tag_hdr.length - sizeof (m_tag->ape_tag_hdr)) == m_tag->ape_tag_hdr.length - sizeof (m_tag->ape_tag_hdr));
+        m_tag->ape_tag_hdr.flags &= ~APE_TAG_THIS_IS_HEADER;    // this is NOT header
+        WavpackNativeToLittleEndian (&m_tag->ape_tag_hdr, APE_Tag_Hdr_Format);
+        if (result)
+            result = (wpc->reader->write_bytes (wpc->wv_in, &m_tag->ape_tag_hdr, sizeof (m_tag->ape_tag_hdr)) == sizeof (m_tag->ape_tag_hdr));
+        WavpackLittleEndianToNative (&m_tag->ape_tag_hdr, APE_Tag_Hdr_Format);
+    }
+
+    if (result && tag_size < -m_tag->tag_file_pos && wpc->reader->truncate_here)
+        result = !wpc->reader->truncate_here (wpc->wv_in);
+
+    if (!result)
+        strcpy (wpc->error_message, "can't write WavPack data, disk probably full!");
+    return result;
+}
+
+// Copy the specified ID3v1 tag value (with specified field size) from the
+// source pointer to the destination, eliminating leading spaces and trailing
+// spaces and nulls. The result is always null-terminated, so "dest" needs
+// room for tag_size + 1 bytes; nothing outside the field is read.
+
+static void tagcpy (char *dest, char *src, int tag_size)
+{
+    char *s1 = src, *s2 = src + tag_size - 1;
+
+    if (tag_size > 1 && *s2 && !s2 [-1])    // a set last byte that follows a null is not text
+        s2--;
+
+    while (s1 <= s2)
+        if (*s1 == ' ')
+            ++s1;
+        else if (!*s2 || *s2 == ' ')
+            --s2;
+        else
+            break;
+
+    while (s1 <= s2 && *s1)     // bounds first, so an all-space field is not read past its end
+        *dest++ = *s1++;
+    *dest = 0;
+}
+
+// Return non-zero if the ID3v1 field at "src" ("tag_size" bytes) holds any
+// text once leading spaces and trailing spaces and nulls are ignored.
+static int tagdata (char *src, int tag_size)
+{
+    char *s1 = src, *s2 = src + tag_size - 1;
+
+    if (tag_size > 1 && *s2 && !s2 [-1])    // a set last byte that follows a null is not text
+        s2--;
+
+    while (s1 <= s2 && (*s1 == ' ' || !*s2 || *s2 == ' '))
+        if (*s1 == ' ')
+            ++s1;
+        else
+            --s2;
+
+    return (s1 <= s2 && *s1);   // bounds first, so nothing past the field is read
+}
diff --git a/third_party/wavpack/src/tags.c b/third_party/wavpack/src/tags.c
index 56403ec..21884ea 100644
--- a/third_party/wavpack/src/tags.c
+++ b/third_party/wavpack/src/tags.c
@@ -1,247 +1,23 @@
 ////////////////////////////////////////////////////////////////////////////
 //                           **** WAVPACK ****                            //
 //                  Hybrid Lossless Wavefile Compressor                   //
-//              Copyright (c) 1998 - 2009 Conifer Software.               //
+//              Copyright (c) 1998 - 2013 Conifer Software.               //
 //                          All Rights Reserved.                          //
 //      Distributed under the BSD Software License (see license.txt)      //
 ////////////////////////////////////////////////////////////////////////////
 
 // tags.c
 
-// This module provides support for reading and writing metadata tags.
+// This module provides support for reading metadata tags (either ID3v1 or
+// APEv2) from WavPack files. No actual creation or manipulation of the tags
+// is done in this module; this is just internal code to load the tags into
+// memory. The high-level API functions are in the tag_utils.c module.
 
 #include <stdlib.h>
 #include <string.h>
 
 #include "wavpack_local.h"
 
-#ifdef WIN32
-#define stricmp(x,y) _stricmp(x,y)
-#define fileno _fileno
-#else
-#define stricmp strcasecmp
-#endif
-
-#ifdef DEBUG_ALLOC
-#define malloc malloc_db
-#define realloc realloc_db
-#define free free_db
-void *malloc_db (uint32_t size);
-void *realloc_db (void *ptr, uint32_t size);
-void free_db (void *ptr);
-int32_t dump_alloc (void);
-#endif
-
-#ifndef NO_TAGS
-
-static int get_ape_tag_item (M_Tag *m_tag, const char *item, char *value, int size, int type);
-static int get_id3_tag_item (M_Tag *m_tag, const char *item, char *value, int size);
-static int get_ape_tag_item_indexed (M_Tag *m_tag, int index, char *item, int size, int type);
-static int get_id3_tag_item_indexed (M_Tag *m_tag, int index, char *item, int size);
-static int append_ape_tag_item (WavpackContext *wpc, const char *item, const char *value, int vsize, int type);
-static int write_tag_blockout (WavpackContext *wpc);
-static int write_tag_reader (WavpackContext *wpc);
-static void tagcpy (char *dest, char *src, int tag_size);
-static int tagdata (char *src, int tag_size);
-
-//////////////////// Global functions part of external API /////////////////////////
-
-// Count and return the total number of tag items in the specified file.
-
-int WavpackGetNumTagItems (WavpackContext *wpc)
-{
-    int i = 0;
-
-    while (WavpackGetTagItemIndexed (wpc, i, NULL, 0))
-        ++i;
-
-    return i;
-}
-
-// Count and return the total number of binary tag items in the specified file. This applies
-// only to APEv2 tags and was implemented as a separate function to avoid breaking the old API.
-
-int WavpackGetNumBinaryTagItems (WavpackContext *wpc)
-{
-    int i = 0;
-
-    while (WavpackGetBinaryTagItemIndexed (wpc, i, NULL, 0))
-        ++i;
-
-    return i;
-}
-
-// Attempt to get the specified item from the specified file's ID3v1 or APEv2
-// tag. The "size" parameter specifies the amount of space available at "value",
-// if the desired item will not fit in this space then ellipses (...) will
-// be appended and the string terminated. Only text data are supported. The
-// actual length of the string is returned (or 0 if no matching value found).
-// Note that with APEv2 tags the length might not be the same as the number of
-// characters because UTF-8 encoding is used. Also, APEv2 tags can have multiple
-// (NULL separated) strings for a single value (this is why the length is
-// returned). If this function is called with a NULL "value" pointer (or a
-// zero "length") then only the actual length of the value data is returned
-// (not counting the terminating NULL). This can be used to determine the
-// actual memory to be allocated beforehand.
-
-int WavpackGetTagItem (WavpackContext *wpc, const char *item, char *value, int size)
-{
-    M_Tag *m_tag = &wpc->m_tag;
-
-    if (value && size)
-        *value = 0;
-
-    if (m_tag->ape_tag_hdr.ID [0] == 'A')
-        return get_ape_tag_item (m_tag, item, value, size, APE_TAG_TYPE_TEXT);
-    else if (m_tag->id3_tag.tag_id [0] == 'T')
-        return get_id3_tag_item (m_tag, item, value, size);
-    else
-        return 0;
-}
-
-// Attempt to get the specified binary item from the specified file's APEv2
-// tag. The "size" parameter specifies the amount of space available at "value".
-// If the desired item will not fit in this space then nothing will be copied
-// and 0 will be returned, otherwise the actual size will be returned. If this
-// function is called with a NULL "value" pointer (or a zero "length") then only
-// the actual length of the value data is returned and can be used to determine
-// the actual memory to be allocated beforehand.
-
-int WavpackGetBinaryTagItem (WavpackContext *wpc, const char *item, char *value, int size)
-{
-    M_Tag *m_tag = &wpc->m_tag;
-
-    if (value && size)
-        *value = 0;
-
-    if (m_tag->ape_tag_hdr.ID [0] == 'A')
-        return get_ape_tag_item (m_tag, item, value, size, APE_TAG_TYPE_BINARY);
-    else
-        return 0;
-}
-
-// This function looks up the tag item name by index and is used when the
-// application wants to access all the items in the file's ID3v1 or APEv2 tag.
-// Note that this function accesses only the item's name; WavpackGetTagItem()
-// still must be called to get the actual value. The "size" parameter specifies
-// the amount of space available at "item", if the desired item will not fit in
-// this space then ellipses (...) will be appended and the string terminated.
-// The actual length of the string is returned (or 0 if no item exists for
-// index). If this function is called with a NULL "value" pointer (or a
-// zero "length") then only the actual length of the item name is returned
-// (not counting the terminating NULL). This can be used to determine the
-// actual memory to be allocated beforehand. For binary tag values use the
-// otherwise identical WavpackGetBinaryTagItemIndexed ();
-
-int WavpackGetTagItemIndexed (WavpackContext *wpc, int index, char *item, int size)
-{
-    M_Tag *m_tag = &wpc->m_tag;
-
-    if (item && size)
-        *item = 0;
-
-    if (m_tag->ape_tag_hdr.ID [0] == 'A')
-        return get_ape_tag_item_indexed (m_tag, index, item, size, APE_TAG_TYPE_TEXT);
-    else if (m_tag->id3_tag.tag_id [0] == 'T')
-        return get_id3_tag_item_indexed (m_tag, index, item, size);
-    else
-        return 0;
-}
-
-int WavpackGetBinaryTagItemIndexed (WavpackContext *wpc, int index, char *item, int size)
-{
-    M_Tag *m_tag = &wpc->m_tag;
-
-    if (item && size)
-        *item = 0;
-
-    if (m_tag->ape_tag_hdr.ID [0] == 'A')
-        return get_ape_tag_item_indexed (m_tag, index, item, size, APE_TAG_TYPE_BINARY);
-    else
-        return 0;
-}
-
-// These two functions are used to append APEv2 tags to WavPack files; one is
-// for text values (UTF-8 encoded) and the other is for binary values. If no tag
-// has been started, then an empty one will be allocated first. When finished,
-// use WavpackWriteTag() to write the completed tag to the file. The purpose of
-// the passed size parameter is obvious for binary values, but might not be for
-// text values. Keep in mind that APEv2 text values can have multiple values
-// that are NULL separated, so the size is required to know the extent of the
-// value (although the final terminating NULL is not included in the passed
-// size). If the specified item already exists, it will be replaced with the
-// new value. ID3v1 tags are not supported.
-
-int WavpackAppendTagItem (WavpackContext *wpc, const char *item, const char *value, int vsize)
-{
-    while (WavpackDeleteTagItem (wpc, item));
-    return append_ape_tag_item (wpc, item, value, vsize, APE_TAG_TYPE_TEXT);
-}
-
-int WavpackAppendBinaryTagItem (WavpackContext *wpc, const char *item, const char *value, int vsize)
-{
-    while (WavpackDeleteTagItem (wpc, item));
-    return append_ape_tag_item (wpc, item, value, vsize, APE_TAG_TYPE_BINARY);
-}
-
-// Delete the specified tag item from the APEv2 tag on the specified WavPack file
-// (fields cannot be deleted from ID3v1 tags). A return value of TRUE indicates
-// that the item was found and successfully deleted.
-
-int WavpackDeleteTagItem (WavpackContext *wpc, const char *item)
-{
-    M_Tag *m_tag = &wpc->m_tag;
-
-    if (m_tag->ape_tag_hdr.ID [0] == 'A') {
-        unsigned char *p = m_tag->ape_tag_data;
-        unsigned char *q = p + m_tag->ape_tag_hdr.length - sizeof (APE_Tag_Hdr);
-        int i;
-
-        for (i = 0; i < m_tag->ape_tag_hdr.item_count; ++i) {
-            int vsize, isize;
-
-            vsize = p[0] + (p[1] << 8) + (p[2] << 16) + (p[3] << 24); p += 8;   // skip flags because we don't need them
-            for (isize = 0; p[isize] && p + isize < q; ++isize);
-
-            if (vsize < 0 || vsize > m_tag->ape_tag_hdr.length || p + isize + vsize + 1 > q)
-                break;
-
-            if (isize && vsize && !stricmp (item, (char *) p)) {
-                unsigned char *d = p - 8;
-
-                p += isize + vsize + 1;
-
-                while (p < q)
-                    *d++ = *p++;
-
-                m_tag->ape_tag_hdr.length = (int32_t)(d - m_tag->ape_tag_data) + sizeof (APE_Tag_Hdr);
-                m_tag->ape_tag_hdr.item_count--;
-                return 1;
-            }
-            else
-                p += isize + vsize + 1;
-        }
-    }
-
-    return 0;
-}
-
-// Once a APEv2 tag has been created with WavpackAppendTag(), this function is
-// used to write the completed tag to the end of the WavPack file. Note that
-// this function uses the same "blockout" function that is used to write
-// regular WavPack blocks, although that's where the similarity ends. It is also
-// used to write tags that have been edited on existing files.
-
-int WavpackWriteTag (WavpackContext *wpc)
-{
-    if (wpc->blockout)      // this is the case for creating fresh WavPack files
-        return write_tag_blockout (wpc);
-    else                    // otherwise we are editing existing tags (OPEN_EDIT_TAGS)
-        return write_tag_reader (wpc);
-}
-
-//////// Utility functions provided to other modules (but not part of lib API) /////////
-
 // This function attempts to load an ID3v1 or APEv2 tag from the specified
 // file into the specified M_Tag structure. The ID3 tag fits in completely,
 // but an APEv2 tag is variable length and so space must be allocated here
@@ -278,12 +54,12 @@ int load_tag (WavpackContext *wpc)
         if (wpc->reader->read_bytes (wpc->wv_in, &m_tag->ape_tag_hdr, sizeof (APE_Tag_Hdr)) == sizeof (APE_Tag_Hdr) &&
             !strncmp (m_tag->ape_tag_hdr.ID, "APETAGEX", 8)) {
 
-                little_endian_to_native (&m_tag->ape_tag_hdr, APE_Tag_Hdr_Format);
+                WavpackLittleEndianToNative (&m_tag->ape_tag_hdr, APE_Tag_Hdr_Format);
 
                 if (m_tag->ape_tag_hdr.version == 2000 && m_tag->ape_tag_hdr.item_count &&
                     m_tag->ape_tag_hdr.length > sizeof (m_tag->ape_tag_hdr) &&
                     m_tag->ape_tag_hdr.length <= APE_TAG_MAX_LENGTH &&
-                    (m_tag->ape_tag_data = malloc (m_tag->ape_tag_hdr.length)) != NULL) {
+                    (m_tag->ape_tag_data = (unsigned char *)malloc (m_tag->ape_tag_hdr.length)) != NULL) {
 
                         ape_tag_items = m_tag->ape_tag_hdr.item_count;
                         ape_tag_length = m_tag->ape_tag_hdr.length;
@@ -315,7 +91,7 @@ int load_tag (WavpackContext *wpc)
                                         return FALSE;       // something's wrong...
                                 }
 
-                                little_endian_to_native (&m_tag->ape_tag_hdr, APE_Tag_Hdr_Format);
+                                WavpackLittleEndianToNative (&m_tag->ape_tag_hdr, APE_Tag_Hdr_Format);
 
                                 if (m_tag->ape_tag_hdr.version != 2000 || m_tag->ape_tag_hdr.item_count != ape_tag_items ||
                                     m_tag->ape_tag_hdr.length != ape_tag_length) {
@@ -401,366 +177,3 @@ void free_tag (M_Tag *m_tag)
         m_tag->ape_tag_data = NULL;
     }
 }
-
-////////////////////////// local static functions /////////////////////////////
-
-static int get_ape_tag_item (M_Tag *m_tag, const char *item, char *value, int size, int type)
-{
-    unsigned char *p = m_tag->ape_tag_data;
-    unsigned char *q = p + m_tag->ape_tag_hdr.length - sizeof (APE_Tag_Hdr);
-    int i;
-
-    for (i = 0; i < m_tag->ape_tag_hdr.item_count && q - p > 8; ++i) {
-        int vsize, flags, isize;
-
-        vsize = p[0] + (p[1] << 8) + (p[2] << 16) + (p[3] << 24); p += 4;
-        flags = p[0] + (p[1] << 8) + (p[2] << 16) + (p[3] << 24); p += 4;
-        for (isize = 0; p[isize] && p + isize < q; ++isize);
-
-        if (vsize < 0 || vsize > m_tag->ape_tag_hdr.length || p + isize + vsize + 1 > q)
-            break;
-
-        if (isize && vsize && !stricmp (item, (char *) p) && ((flags & 6) >> 1) == type) {
-
-            if (!value || !size)
-                return vsize;
-
-            if (type == APE_TAG_TYPE_BINARY) {
-                if (vsize <= size) {
-                    memcpy (value, p + isize + 1, vsize);
-                    return vsize;
-                }
-                else
-                    return 0;
-            }
-            else if (vsize < size) {
-                memcpy (value, p + isize + 1, vsize);
-                value [vsize] = 0;
-                return vsize;
-            }
-            else if (size >= 4) {
-                memcpy (value, p + isize + 1, size - 1);
-                value [size - 4] = value [size - 3] = value [size - 2] = '.';
-                value [size - 1] = 0;
-                return size - 1;
-            }
-            else
-                return 0;
-        }
-        else
-            p += isize + vsize + 1;
-    }
-
-    return 0;
-}
-
-static int get_id3_tag_item (M_Tag *m_tag, const char *item, char *value, int size)
-{
-    char lvalue [64];
-    int len;
-
-    lvalue [0] = 0;
-
-    if (!stricmp (item, "title"))
-        tagcpy (lvalue, m_tag->id3_tag.title, sizeof (m_tag->id3_tag.title));
-    else if (!stricmp (item, "artist"))
-        tagcpy (lvalue, m_tag->id3_tag.artist, sizeof (m_tag->id3_tag.artist));
-    else if (!stricmp (item, "album"))
-        tagcpy (lvalue, m_tag->id3_tag.album, sizeof (m_tag->id3_tag.album));
-    else if (!stricmp (item, "year"))
-        tagcpy (lvalue, m_tag->id3_tag.year, sizeof (m_tag->id3_tag.year));
-    else if (!stricmp (item, "comment"))
-        tagcpy (lvalue, m_tag->id3_tag.comment, sizeof (m_tag->id3_tag.comment));
-    else if (!stricmp (item, "track") && m_tag->id3_tag.comment [29] && !m_tag->id3_tag.comment [28])
-        sprintf (lvalue, "%d", m_tag->id3_tag.comment [29]);
-    else
-        return 0;
-
-    len = (int) strlen (lvalue);
-
-    if (!value || !size)
-        return len;
-
-    if (len < size) {
-        strcpy (value, lvalue);
-        return len;
-    }
-    else if (size >= 4) {
-        strncpy (value, lvalue, size - 1);
-        value [size - 4] = value [size - 3] = value [size - 2] = '.';
-        value [size - 1] = 0;
-        return size - 1;
-    }
-    else
-        return 0;
-}
-
-static int get_ape_tag_item_indexed (M_Tag *m_tag, int index, char *item, int size, int type)
-{
-    unsigned char *p = m_tag->ape_tag_data;
-    unsigned char *q = p + m_tag->ape_tag_hdr.length - sizeof (APE_Tag_Hdr);
-    int i;
-
-    for (i = 0; i < m_tag->ape_tag_hdr.item_count && index >= 0 && q - p > 8; ++i) {
-        int vsize, flags, isize;
-
-        vsize = p[0] + (p[1] << 8) + (p[2] << 16) + (p[3] << 24); p += 4;
-        flags = p[0] + (p[1] << 8) + (p[2] << 16) + (p[3] << 24); p += 4;
-        for (isize = 0; p[isize] && p + isize < q; ++isize);
-
-        if (vsize < 0 || vsize > m_tag->ape_tag_hdr.length || p + isize + vsize + 1 > q)
-            break;
-
-        if (isize && vsize && ((flags & 6) >> 1) == type && !index--) {
-
-            if (!item || !size)
-                return isize;
-
-            if (isize < size) {
-                memcpy (item, p, isize);
-                item [isize] = 0;
-                return isize;
-            }
-            else if (size >= 4) {
-                memcpy (item, p, size - 1);
-                item [size - 4] = item [size - 3] = item [size - 2] = '.';
-                item [size - 1] = 0;
-                return size - 1;
-            }
-            else
-                return 0;
-        }
-        else
-            p += isize + vsize + 1;
-    }
-
-    return 0;
-}
-
-static int get_id3_tag_item_indexed (M_Tag *m_tag, int index, char *item, int size)
-{
-    char lvalue [16];
-    int len;
-
-    lvalue [0] = 0;
-
-    if (tagdata (m_tag->id3_tag.title, sizeof (m_tag->id3_tag.title)) && !index--)
-        strcpy (lvalue, "Title");
-    else if (tagdata (m_tag->id3_tag.artist, sizeof (m_tag->id3_tag.artist)) && !index--)
-        strcpy (lvalue, "Artist");
-    else if (tagdata (m_tag->id3_tag.album, sizeof (m_tag->id3_tag.album)) && !index--)
-        strcpy (lvalue, "Album");
-    else if (tagdata (m_tag->id3_tag.year, sizeof (m_tag->id3_tag.year)) && !index--)
-        strcpy (lvalue, "Year");
-    else if (tagdata (m_tag->id3_tag.comment, sizeof (m_tag->id3_tag.comment)) && !index--)
-        strcpy (lvalue, "Comment");
-    else if (m_tag->id3_tag.comment [29] && !m_tag->id3_tag.comment [28] && !index--)
-        strcpy (lvalue, "Track");
-    else
-        return 0;
-
-    len = (int) strlen (lvalue);
-
-    if (!item || !size)
-        return len;
-
-    if (len < size) {
-        strcpy (item, lvalue);
-        return len;
-    }
-    else if (size >= 4) {
-        strncpy (item, lvalue, size - 1);
-        item [size - 4] = item [size - 3] = item [size - 2] = '.';
-        item [size - 1] = 0;
-        return size - 1;
-    }
-    else
-        return 0;
-}
-
-static int append_ape_tag_item (WavpackContext *wpc, const char *item, const char *value, int vsize, int type)
-{
-    M_Tag *m_tag = &wpc->m_tag;
-    int isize = (int) strlen (item);
-
-    if (!m_tag->ape_tag_hdr.ID [0]) {
-        strncpy (m_tag->ape_tag_hdr.ID, "APETAGEX", sizeof (m_tag->ape_tag_hdr.ID));
-        m_tag->ape_tag_hdr.version = 2000;
-        m_tag->ape_tag_hdr.length = sizeof (m_tag->ape_tag_hdr);
-        m_tag->ape_tag_hdr.item_count = 0;
-        m_tag->ape_tag_hdr.flags = APE_TAG_CONTAINS_HEADER;  // we will include header on tags we originate
-    }
-
-    if (m_tag->ape_tag_hdr.ID [0] == 'A') {
-        int new_item_len = vsize + isize + 9, flags = type << 1;
-        unsigned char *p;
-
-        if (m_tag->ape_tag_hdr.length + new_item_len > APE_TAG_MAX_LENGTH) {
-            strcpy (wpc->error_message, "APEv2 tag exceeds maximum allowed length!");
-            return FALSE;
-        }
-
-        m_tag->ape_tag_hdr.item_count++;
-        m_tag->ape_tag_hdr.length += new_item_len;
-        p = m_tag->ape_tag_data = realloc (m_tag->ape_tag_data, m_tag->ape_tag_hdr.length);
-        p += m_tag->ape_tag_hdr.length - sizeof (APE_Tag_Hdr) - new_item_len;
-
-        *p++ = (unsigned char) vsize;
-        *p++ = (unsigned char) (vsize >> 8);
-        *p++ = (unsigned char) (vsize >> 16);
-        *p++ = (unsigned char) (vsize >> 24);
-
-        *p++ = (unsigned char) flags;
-        *p++ = (unsigned char) (flags >> 8);
-        *p++ = (unsigned char) (flags >> 16);
-        *p++ = (unsigned char) (flags >> 24);
-
-        strcpy ((char *) p, item);
-        p += isize + 1;
-        memcpy (p, value, vsize);
-
-        return TRUE;
-    }
-    else
-        return FALSE;
-}
-
-static int write_tag_blockout (WavpackContext *wpc)
-{
-    M_Tag *m_tag = &wpc->m_tag;
-    int result = TRUE;
-
-    if (m_tag->ape_tag_hdr.ID [0] == 'A' && m_tag->ape_tag_hdr.item_count) {
-
-        // only write header if it's specified in the flags
-
-        if (m_tag->ape_tag_hdr.flags & APE_TAG_CONTAINS_HEADER) {
-            m_tag->ape_tag_hdr.flags |= APE_TAG_THIS_IS_HEADER;
-            native_to_little_endian (&m_tag->ape_tag_hdr, APE_Tag_Hdr_Format);
-            result = wpc->blockout (wpc->wv_out, &m_tag->ape_tag_hdr, sizeof (m_tag->ape_tag_hdr));
-            little_endian_to_native (&m_tag->ape_tag_hdr, APE_Tag_Hdr_Format);
-        }
-
-        if (m_tag->ape_tag_hdr.length > sizeof (m_tag->ape_tag_hdr))
-            result = wpc->blockout (wpc->wv_out, m_tag->ape_tag_data, m_tag->ape_tag_hdr.length - sizeof (m_tag->ape_tag_hdr));
-
-        m_tag->ape_tag_hdr.flags &= ~APE_TAG_THIS_IS_HEADER;    // this is NOT header
-        native_to_little_endian (&m_tag->ape_tag_hdr, APE_Tag_Hdr_Format);
-        result = wpc->blockout (wpc->wv_out, &m_tag->ape_tag_hdr, sizeof (m_tag->ape_tag_hdr));
-        little_endian_to_native (&m_tag->ape_tag_hdr, APE_Tag_Hdr_Format);
-    }
-
-    if (!result)
-        strcpy (wpc->error_message, "can't write WavPack data, disk probably full!");
-
-    return result;
-}
-
-static int write_tag_reader (WavpackContext *wpc)
-{
-    M_Tag *m_tag = &wpc->m_tag;
-    int32_t tag_size = 0;
-    int result;
-
-    // before we write an edited (or new) tag into an existing file, make sure it's safe and possible
-
-    if (m_tag->tag_begins_file) {
-        strcpy (wpc->error_message, "can't edit tags located at the beginning of files!");
-        return FALSE;
-    }
-
-    if (!wpc->reader->can_seek (wpc->wv_in)) {
-        strcpy (wpc->error_message, "can't edit tags on pipes or unseekable files!");
-        return FALSE;
-    }
-
-    if (!(wpc->open_flags & OPEN_EDIT_TAGS)) {
-        strcpy (wpc->error_message, "can't edit tags without OPEN_EDIT_TAGS flag!");
-        return FALSE;
-    }
-
-    if (m_tag->ape_tag_hdr.ID [0] == 'A' && m_tag->ape_tag_hdr.item_count &&
-        m_tag->ape_tag_hdr.length > sizeof (m_tag->ape_tag_hdr))
-            tag_size = m_tag->ape_tag_hdr.length;
-
-    // only write header if it's specified in the flags
-
-    if (m_tag->ape_tag_hdr.flags & APE_TAG_CONTAINS_HEADER)
-        tag_size += sizeof (m_tag->ape_tag_hdr);
-
-    result = !wpc->reader->set_pos_rel (wpc->wv_in, m_tag->tag_file_pos, SEEK_END);
-
-    if (result && tag_size < -m_tag->tag_file_pos) {
-        int nullcnt = -m_tag->tag_file_pos - tag_size;
-        char zero [1] = { 0 };
-
-        while (nullcnt--)
-            wpc->reader->write_bytes (wpc->wv_in, &zero, 1);
-    }
-
-    if (result && tag_size) {
-        if (m_tag->ape_tag_hdr.flags & APE_TAG_CONTAINS_HEADER) {
-            m_tag->ape_tag_hdr.flags |= APE_TAG_THIS_IS_HEADER;
-            native_to_little_endian (&m_tag->ape_tag_hdr, APE_Tag_Hdr_Format);
-            result = (wpc->reader->write_bytes (wpc->wv_in, &m_tag->ape_tag_hdr, sizeof (m_tag->ape_tag_hdr)) == sizeof (m_tag->ape_tag_hdr));
-            little_endian_to_native (&m_tag->ape_tag_hdr, APE_Tag_Hdr_Format);
-        }
-
-        result = (wpc->reader->write_bytes (wpc->wv_in, m_tag->ape_tag_data, m_tag->ape_tag_hdr.length - sizeof (m_tag->ape_tag_hdr)) == sizeof (m_tag->ape_tag_hdr));
-        m_tag->ape_tag_hdr.flags &= ~APE_TAG_THIS_IS_HEADER;    // this is NOT header
-        native_to_little_endian (&m_tag->ape_tag_hdr, APE_Tag_Hdr_Format);
-        result = (wpc->reader->write_bytes (wpc->wv_in, &m_tag->ape_tag_hdr, sizeof (m_tag->ape_tag_hdr)) == sizeof (m_tag->ape_tag_hdr));
-        little_endian_to_native (&m_tag->ape_tag_hdr, APE_Tag_Hdr_Format);
-    }
-
-    if (!result)
-        strcpy (wpc->error_message, "can't write WavPack data, disk probably full!");
-
-    return result;
-}
-
-// Copy the specified ID3v1 tag value (with specified field size) from the
-// source pointer to the destination, eliminating leading spaces and trailing
-// spaces and nulls.
-
-static void tagcpy (char *dest, char *src, int tag_size)
-{
-    char *s1 = src, *s2 = src + tag_size - 1;
-
-    if (*s2 && !s2 [-1])
-        s2--;
-
-    while (s1 <= s2)
-        if (*s1 == ' ')
-            ++s1;
-        else if (!*s2 || *s2 == ' ')
-            --s2;
-        else
-            break;
-
-    while (*s1 && s1 <= s2)
-        *dest++ = *s1++;
-
-    *dest = 0;
-}
-
-static int tagdata (char *src, int tag_size)
-{
-    char *s1 = src, *s2 = src + tag_size - 1;
-
-    if (*s2 && !s2 [-1])
-        s2--;
-
-    while (s1 <= s2)
-        if (*s1 == ' ')
-            ++s1;
-        else if (!*s2 || *s2 == ' ')
-            --s2;
-        else
-            break;
-
-    return (*s1 && s1 <= s2);
-}
-
-#endif
-
diff --git a/third_party/wavpack/src/unpack.c b/third_party/wavpack/src/unpack.c
index b296723..c5ae9f7 100644
--- a/third_party/wavpack/src/unpack.c
+++ b/third_party/wavpack/src/unpack.c
@@ -1,468 +1,56 @@
 ////////////////////////////////////////////////////////////////////////////
 //                           **** WAVPACK ****                            //
 //                  Hybrid Lossless Wavefile Compressor                   //
-//              Copyright (c) 1998 - 2006 Conifer Software.               //
+//              Copyright (c) 1998 - 2013 Conifer Software.               //
 //                          All Rights Reserved.                          //
 //      Distributed under the BSD Software License (see license.txt)      //
 ////////////////////////////////////////////////////////////////////////////
 
 // unpack.c
 
-// This module actually handles the decompression of the audio data, except
-// for the entropy decoding which is handled by the words? modules. For
-// maximum efficiency, the conversion is isolated to tight loops that handle
-// an entire buffer.
+// This module actually handles the decompression of the audio data, except for
+// the entropy decoding which is handled by the read_words.c module. For better
+// efficiency, the conversion is isolated to tight loops that handle an entire
+// buffer.
+
+#include <stdlib.h>
+#include <string.h>
 
 #include "wavpack_local.h"
 
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
+#ifdef OPT_ASM_X86
+    #define DECORR_STEREO_PASS_CONT unpack_decorr_stereo_pass_cont_x86
+    #define DECORR_STEREO_PASS_CONT_AVAILABLE unpack_cpu_has_feature_x86(CPU_FEATURE_MMX)
+    #define DECORR_MONO_PASS_CONT unpack_decorr_mono_pass_cont_x86
+#elif defined(OPT_ASM_X64) && (defined (_WIN64) || defined(__CYGWIN__) || defined(__MINGW64__))
+    #define DECORR_STEREO_PASS_CONT unpack_decorr_stereo_pass_cont_x64win
+    #define DECORR_STEREO_PASS_CONT_AVAILABLE 1
+    #define DECORR_MONO_PASS_CONT unpack_decorr_mono_pass_cont_x64win
+#elif defined(OPT_ASM_X64)
+    #define DECORR_STEREO_PASS_CONT unpack_decorr_stereo_pass_cont_x64
+    #define DECORR_STEREO_PASS_CONT_AVAILABLE 1
+    #define DECORR_MONO_PASS_CONT unpack_decorr_mono_pass_cont_x64
+#elif defined(OPT_ASM_ARM)
+    #define DECORR_STEREO_PASS_CONT unpack_decorr_stereo_pass_cont_armv7
+    #define DECORR_STEREO_PASS_CONT_AVAILABLE 1
+    #define DECORR_MONO_PASS_CONT unpack_decorr_mono_pass_cont_armv7
+#endif
 
-// This flag provides faster decoding speed at the expense of more code. The
-// improvement applies to 16-bit stereo lossless only.
+#ifdef DECORR_STEREO_PASS_CONT
+extern void DECORR_STEREO_PASS_CONT (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count, int32_t long_math);
+extern void DECORR_MONO_PASS_CONT (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count, int32_t long_math);
+#endif
 
-#define FAST_DECODE
+// This flag provides the functionality of terminating the decoding and muting
+// the output when a lossy sample appears to be corrupt. This is automatic
+// for lossless files because a corrupt sample is unambiguous, but for lossy
+// data it might be possible for this to falsely trigger (although I have never
+// seen it).
 
 #define LOSSY_MUTE
 
-#ifdef DEBUG_ALLOC
-#define malloc malloc_db
-#define realloc realloc_db
-#define free free_db
-void *malloc_db (uint32_t size);
-void *realloc_db (void *ptr, uint32_t size);
-void free_db (void *ptr);
-int32_t dump_alloc (void);
-#endif
-
 ///////////////////////////// executable code ////////////////////////////////
 
-// This function initializes everything required to unpack a WavPack block
-// and must be called before unpack_samples() is called to obtain audio data.
-// It is assumed that the WavpackHeader has been read into the wps->wphdr
-// (in the current WavpackStream) and that the entire block has been read at
-// wps->blockbuff. If a correction file is available (wpc->wvc_flag = TRUE)
-// then the corresponding correction block must be read into wps->block2buff
-// and its WavpackHeader has overwritten the header at wps->wphdr. This is
-// where all the metadata blocks are scanned including those that contain
-// bitstream data.
-
-int unpack_init (WavpackContext *wpc)
-{
-    WavpackStream *wps = wpc->streams [wpc->current_stream];
-    unsigned char *blockptr, *block2ptr;
-    WavpackMetadata wpmd;
-
-    wps->mute_error = FALSE;
-    wps->crc = wps->crc_x = 0xffffffff;
-    CLEAR (wps->wvbits);
-    CLEAR (wps->wvcbits);
-    CLEAR (wps->wvxbits);
-    CLEAR (wps->decorr_passes);
-    CLEAR (wps->dc);
-    CLEAR (wps->w);
-
-    if (!(wps->wphdr.flags & MONO_FLAG) && wpc->config.num_channels && wps->wphdr.block_samples &&
-        (wpc->reduced_channels == 1 || wpc->config.num_channels == 1)) {
-            wps->mute_error = TRUE;
-            return FALSE;
-    }
-
-    if ((wps->wphdr.flags & UNKNOWN_FLAGS) || (wps->wphdr.flags & MONO_DATA) == MONO_DATA) {
-        wps->mute_error = TRUE;
-        return FALSE;
-    }
-
-    blockptr = wps->blockbuff + sizeof (WavpackHeader);
-
-    while (read_metadata_buff (&wpmd, wps->blockbuff, &blockptr))
-        if (!process_metadata (wpc, &wpmd)) {
-            wps->mute_error = TRUE;
-            return FALSE;
-        }
-
-    if (wps->wphdr.block_samples && wpc->wvc_flag && wps->block2buff) {
-        block2ptr = wps->block2buff + sizeof (WavpackHeader);
-
-        while (read_metadata_buff (&wpmd, wps->block2buff, &block2ptr))
-            if (!process_metadata (wpc, &wpmd)) {
-                wps->mute_error = TRUE;
-                return FALSE;
-            }
-    }
-
-    if (wps->wphdr.block_samples && !bs_is_open (&wps->wvbits)) {
-        if (bs_is_open (&wps->wvcbits))
-            strcpy (wpc->error_message, "can't unpack correction files alone!");
-
-        wps->mute_error = TRUE;
-        return FALSE;
-    }
-
-    if (wps->wphdr.block_samples && !bs_is_open (&wps->wvxbits)) {
-        if ((wps->wphdr.flags & INT32_DATA) && wps->int32_sent_bits)
-            wpc->lossy_blocks = TRUE;
-
-        if ((wps->wphdr.flags & FLOAT_DATA) &&
-            wps->float_flags & (FLOAT_EXCEPTIONS | FLOAT_ZEROS_SENT | FLOAT_SHIFT_SENT | FLOAT_SHIFT_SAME))
-                wpc->lossy_blocks = TRUE;
-    }
-
-    if (wps->wphdr.block_samples)
-        wps->sample_index = wps->wphdr.block_index;
-
-    return TRUE;
-}
-
-// This function initialzes the main bitstream for audio samples, which must
-// be in the "wv" file.
-
-int init_wv_bitstream (WavpackStream *wps, WavpackMetadata *wpmd)
-{
-    if (!wpmd->byte_length)
-        return FALSE;
-
-    bs_open_read (&wps->wvbits, wpmd->data, (unsigned char *) wpmd->data + wpmd->byte_length);
-    return TRUE;
-}
-
-// This function initialzes the "correction" bitstream for audio samples,
-// which currently must be in the "wvc" file.
-
-int init_wvc_bitstream (WavpackStream *wps, WavpackMetadata *wpmd)
-{
-    if (!wpmd->byte_length)
-        return FALSE;
-
-    bs_open_read (&wps->wvcbits, wpmd->data, (unsigned char *) wpmd->data + wpmd->byte_length);
-    return TRUE;
-}
-
-// This function initialzes the "extra" bitstream for audio samples which
-// contains the information required to losslessly decompress 32-bit float data
-// or integer data that exceeds 24 bits. This bitstream is in the "wv" file
-// for pure lossless data or the "wvc" file for hybrid lossless. This data
-// would not be used for hybrid lossy mode. There is also a 32-bit CRC stored
-// in the first 4 bytes of these blocks.
-
-int init_wvx_bitstream (WavpackStream *wps, WavpackMetadata *wpmd)
-{
-    unsigned char *cp = wpmd->data;
-
-    if (wpmd->byte_length <= 4)
-        return FALSE;
-
-    wps->crc_wvx = *cp++;
-    wps->crc_wvx |= (int32_t) *cp++ << 8;
-    wps->crc_wvx |= (int32_t) *cp++ << 16;
-    wps->crc_wvx |= (int32_t) *cp++ << 24;
-
-    bs_open_read (&wps->wvxbits, cp, (unsigned char *) wpmd->data + wpmd->byte_length);
-    return TRUE;
-}
-
-// Read decorrelation terms from specified metadata block into the
-// decorr_passes array. The terms range from -3 to 8, plus 17 & 18;
-// other values are reserved and generate errors for now. The delta
-// ranges from 0 to 7 with all values valid. Note that the terms are
-// stored in the opposite order in the decorr_passes array compared
-// to packing.
-
-int read_decorr_terms (WavpackStream *wps, WavpackMetadata *wpmd)
-{
-    int termcnt = wpmd->byte_length;
-    unsigned char *byteptr = wpmd->data;
-    struct decorr_pass *dpp;
-
-    if (termcnt > MAX_NTERMS)
-        return FALSE;
-
-    wps->num_terms = termcnt;
-
-    for (dpp = wps->decorr_passes + termcnt - 1; termcnt--; dpp--) {
-        dpp->term = (int)(*byteptr & 0x1f) - 5;
-        dpp->delta = (*byteptr++ >> 5) & 0x7;
-
-        if (!dpp->term || dpp->term < -3 || (dpp->term > MAX_TERM && dpp->term < 17) || dpp->term > 18)
-            return FALSE;
-    }
-
-    return TRUE;
-}
-
-// Read decorrelation weights from specified metadata block into the
-// decorr_passes array. The weights range +/-1024, but are rounded and
-// truncated to fit in signed chars for metadata storage. Weights are
-// separate for the two channels and are specified from the "last" term
-// (first during encode). Unspecified weights are set to zero.
-
-int read_decorr_weights (WavpackStream *wps, WavpackMetadata *wpmd)
-{
-    int termcnt = wpmd->byte_length, tcount;
-    char *byteptr = wpmd->data;
-    struct decorr_pass *dpp;
-
-    if (!(wps->wphdr.flags & MONO_DATA))
-        termcnt /= 2;
-
-    if (termcnt > wps->num_terms)
-        return FALSE;
-
-    for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount--; dpp++)
-        dpp->weight_A = dpp->weight_B = 0;
-
-    while (--dpp >= wps->decorr_passes && termcnt--) {
-        dpp->weight_A = restore_weight (*byteptr++);
-
-        if (!(wps->wphdr.flags & MONO_DATA))
-            dpp->weight_B = restore_weight (*byteptr++);
-    }
-
-    return TRUE;
-}
-
-// Read decorrelation samples from specified metadata block into the
-// decorr_passes array. The samples are signed 32-bit values, but are
-// converted to signed log2 values for storage in metadata. Values are
-// stored for both channels and are specified from the "last" term
-// (first during encode) with unspecified samples set to zero. The
-// number of samples stored varies with the actual term value, so
-// those must obviously come first in the metadata.
-
-int read_decorr_samples (WavpackStream *wps, WavpackMetadata *wpmd)
-{
-    unsigned char *byteptr = wpmd->data;
-    unsigned char *endptr = byteptr + wpmd->byte_length;
-    struct decorr_pass *dpp;
-    int tcount;
-
-    for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount--; dpp++) {
-        CLEAR (dpp->samples_A);
-        CLEAR (dpp->samples_B);
-    }
-
-    if (wps->wphdr.version == 0x402 && (wps->wphdr.flags & HYBRID_FLAG)) {
-        if (byteptr + (wps->wphdr.flags & MONO_DATA ? 2 : 4) > endptr)
-            return FALSE;
-
-        wps->dc.error [0] = exp2s ((short)(byteptr [0] + (byteptr [1] << 8)));
-        byteptr += 2;
-
-        if (!(wps->wphdr.flags & MONO_DATA)) {
-            wps->dc.error [1] = exp2s ((short)(byteptr [0] + (byteptr [1] << 8)));
-            byteptr += 2;
-        }
-    }
-
-    while (dpp-- > wps->decorr_passes && byteptr < endptr)
-        if (dpp->term > MAX_TERM) {
-            if (byteptr + (wps->wphdr.flags & MONO_DATA ? 4 : 8) > endptr)
-                return FALSE;
-
-            dpp->samples_A [0] = exp2s ((short)(byteptr [0] + (byteptr [1] << 8)));
-            dpp->samples_A [1] = exp2s ((short)(byteptr [2] + (byteptr [3] << 8)));
-            byteptr += 4;
-
-            if (!(wps->wphdr.flags & MONO_DATA)) {
-                dpp->samples_B [0] = exp2s ((short)(byteptr [0] + (byteptr [1] << 8)));
-                dpp->samples_B [1] = exp2s ((short)(byteptr [2] + (byteptr [3] << 8)));
-                byteptr += 4;
-            }
-        }
-        else if (dpp->term < 0) {
-            if (byteptr + 4 > endptr)
-                return FALSE;
-
-            dpp->samples_A [0] = exp2s ((short)(byteptr [0] + (byteptr [1] << 8)));
-            dpp->samples_B [0] = exp2s ((short)(byteptr [2] + (byteptr [3] << 8)));
-            byteptr += 4;
-        }
-        else {
-            int m = 0, cnt = dpp->term;
-
-            while (cnt--) {
-                if (byteptr + (wps->wphdr.flags & MONO_DATA ? 2 : 4) > endptr)
-                    return FALSE;
-
-                dpp->samples_A [m] = exp2s ((short)(byteptr [0] + (byteptr [1] << 8)));
-                byteptr += 2;
-
-                if (!(wps->wphdr.flags & MONO_DATA)) {
-                    dpp->samples_B [m] = exp2s ((short)(byteptr [0] + (byteptr [1] << 8)));
-                    byteptr += 2;
-                }
-
-                m++;
-            }
-        }
-
-    return byteptr == endptr;
-}
-
-// Read the shaping weights from specified metadata block into the
-// WavpackStream structure. Note that there must be two values (even
-// for mono streams) and that the values are stored in the same
-// manner as decorrelation weights. These would normally be read from
-// the "correction" file and are used for lossless reconstruction of
-// hybrid data.
-
-int read_shaping_info (WavpackStream *wps, WavpackMetadata *wpmd)
-{
-    if (wpmd->byte_length == 2) {
-        char *byteptr = wpmd->data;
-
-        wps->dc.shaping_acc [0] = (int32_t) restore_weight (*byteptr++) << 16;
-        wps->dc.shaping_acc [1] = (int32_t) restore_weight (*byteptr++) << 16;
-        return TRUE;
-    }
-    else if (wpmd->byte_length >= (wps->wphdr.flags & MONO_DATA ? 4 : 8)) {
-        unsigned char *byteptr = wpmd->data;
-
-        wps->dc.error [0] = exp2s ((short)(byteptr [0] + (byteptr [1] << 8)));
-        wps->dc.shaping_acc [0] = exp2s ((short)(byteptr [2] + (byteptr [3] << 8)));
-        byteptr += 4;
-
-        if (!(wps->wphdr.flags & MONO_DATA)) {
-            wps->dc.error [1] = exp2s ((short)(byteptr [0] + (byteptr [1] << 8)));
-            wps->dc.shaping_acc [1] = exp2s ((short)(byteptr [2] + (byteptr [3] << 8)));
-            byteptr += 4;
-        }
-
-        if (wpmd->byte_length == (wps->wphdr.flags & MONO_DATA ? 6 : 12)) {
-            wps->dc.shaping_delta [0] = exp2s ((short)(byteptr [0] + (byteptr [1] << 8)));
-
-            if (!(wps->wphdr.flags & MONO_DATA))
-                wps->dc.shaping_delta [1] = exp2s ((short)(byteptr [2] + (byteptr [3] << 8)));
-        }
-
-        return TRUE;
-    }
-
-    return FALSE;
-}
-
-// Read the int32 data from the specified metadata into the specified stream.
-// This data is used for integer data that has more than 24 bits of magnitude
-// or, in some cases, used to eliminate redundant bits from any audio stream.
-
-int read_int32_info (WavpackStream *wps, WavpackMetadata *wpmd)
-{
-    int bytecnt = wpmd->byte_length;
-    char *byteptr = wpmd->data;
-
-    if (bytecnt != 4)
-        return FALSE;
-
-    wps->int32_sent_bits = *byteptr++;
-    wps->int32_zeros = *byteptr++;
-    wps->int32_ones = *byteptr++;
-    wps->int32_dups = *byteptr;
-
-    return TRUE;
-}
-
-// Read multichannel information from metadata. The first byte is the total
-// number of channels and the following bytes represent the channel_mask
-// as described for Microsoft WAVEFORMATEX.
-
-int read_channel_info (WavpackContext *wpc, WavpackMetadata *wpmd)
-{
-    int bytecnt = wpmd->byte_length, shift = 0;
-    unsigned char *byteptr = wpmd->data;
-    uint32_t mask = 0;
-
-    if (!bytecnt || bytecnt > 6)
-        return FALSE;
-
-    if (!wpc->config.num_channels) {
-
-        if (bytecnt == 6) {
-            wpc->config.num_channels = (byteptr [0] | ((byteptr [2] & 0xf) << 8)) + 1;
-            wpc->max_streams = (byteptr [1] | ((byteptr [2] & 0xf0) << 4)) + 1;
-
-            if (wpc->config.num_channels < wpc->max_streams)
-                return FALSE;
-    
-            byteptr += 3;
-            mask = *byteptr++;
-            mask |= (uint32_t) *byteptr++ << 8;
-            mask |= (uint32_t) *byteptr << 16;
-        }
-        else {
-            wpc->config.num_channels = *byteptr++;
-
-            while (--bytecnt) {
-                mask |= (uint32_t) *byteptr++ << shift;
-                shift += 8;
-            }
-        }
-
-        if (wpc->config.num_channels > wpc->max_streams * 2)
-            return FALSE;
-
-        wpc->config.channel_mask = mask;
-    }
-
-    return TRUE;
-}
-
-// Read configuration information from metadata.
-
-int read_config_info (WavpackContext *wpc, WavpackMetadata *wpmd)
-{
-    int bytecnt = wpmd->byte_length;
-    unsigned char *byteptr = wpmd->data;
-
-    if (bytecnt >= 3) {
-        wpc->config.flags &= 0xff;
-        wpc->config.flags |= (int32_t) *byteptr++ << 8;
-        wpc->config.flags |= (int32_t) *byteptr++ << 16;
-        wpc->config.flags |= (int32_t) *byteptr++ << 24;
-
-        if (bytecnt >= 4 && (wpc->config.flags & CONFIG_EXTRA_MODE))
-            wpc->config.xmode = *byteptr;
-    }
-
-    return TRUE;
-}
-
-// Read non-standard sampling rate from metadata.
-
-int read_sample_rate (WavpackContext *wpc, WavpackMetadata *wpmd)
-{
-    int bytecnt = wpmd->byte_length;
-    unsigned char *byteptr = wpmd->data;
-
-    if (bytecnt == 3) {
-        wpc->config.sample_rate = (int32_t) *byteptr++;
-        wpc->config.sample_rate |= (int32_t) *byteptr++ << 8;
-        wpc->config.sample_rate |= (int32_t) *byteptr++ << 16;
-    }
-
-    return TRUE;
-}
-
-// Read wrapper data from metadata. Currently, this consists of the RIFF
-// header and trailer that wav files contain around the audio data but could
-// be used for other formats as well. Because WavPack files contain all the
-// information required for decoding and playback, this data can probably
-// be ignored except when an exact wavefile restoration is needed.
-
-int read_wrapper_data (WavpackContext *wpc, WavpackMetadata *wpmd)
-{
-    if ((wpc->open_flags & OPEN_WRAPPER) && wpc->wrapper_bytes < MAX_WRAPPER_BYTES) {
-        wpc->wrapper_data = realloc (wpc->wrapper_data, wpc->wrapper_bytes + wpmd->byte_length);
-        memcpy (wpc->wrapper_data + wpc->wrapper_bytes, wpmd->data, wpmd->byte_length);
-        wpc->wrapper_bytes += wpmd->byte_length;
-    }
-
-    return TRUE;
-}
-
-#ifndef NO_UNPACK
-
 // This monster actually unpacks the WavPack bitstream(s) into the specified
 // buffer as 32-bit integers or floats (depending on orignal data). Lossy
 // samples will be clipped to their original limits (i.e. 8-bit samples are
@@ -480,11 +68,7 @@ int read_wrapper_data (WavpackContext *wpc, WavpackMetadata *wpmd)
 // occurs or the end of the block is reached.
 
 static void decorr_stereo_pass (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count);
-static void decorr_stereo_pass_i (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count);
-static void decorr_stereo_pass_1717 (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count);
-static void decorr_stereo_pass_1718 (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count);
-static void decorr_stereo_pass_1818 (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count);
-static void decorr_stereo_pass_nn (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count);
+static void decorr_mono_pass (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count);
 static void fixup_samples (WavpackContext *wpc, int32_t *buffer, uint32_t sample_count);
 
 int32_t unpack_samples (WavpackContext *wpc, int32_t *buffer, uint32_t sample_count)
@@ -496,8 +80,14 @@ int32_t unpack_samples (WavpackContext *wpc, int32_t *buffer, uint32_t sample_co
     struct decorr_pass *dpp;
     int tcount, m = 0;
 
-    if (wps->sample_index + sample_count > wps->wphdr.block_index + wps->wphdr.block_samples)
-        sample_count = wps->wphdr.block_index + wps->wphdr.block_samples - wps->sample_index;
+    // don't attempt to decode past the end of the block, but watch out for overflow!
+
+    if (wps->sample_index + sample_count > GET_BLOCK_INDEX (wps->wphdr) + wps->wphdr.block_samples &&
+        GET_BLOCK_INDEX (wps->wphdr) + wps->wphdr.block_samples - wps->sample_index < sample_count)
+            sample_count = (uint32_t) (GET_BLOCK_INDEX (wps->wphdr) + wps->wphdr.block_samples - wps->sample_index);
+
+    if (GET_BLOCK_INDEX (wps->wphdr) > wps->sample_index || wps->wphdr.block_samples < sample_count)
+        wps->mute_error = TRUE;
 
     if (wps->mute_error) {
         if (wpc->reduced_channels == 1 || wpc->config.num_channels == 1 || (flags & MONO_FLAG))
@@ -510,7 +100,7 @@ int32_t unpack_samples (WavpackContext *wpc, int32_t *buffer, uint32_t sample_co
     }
 
     if ((flags & HYBRID_FLAG) && !wps->block2buff)
-        mute_limit *= 2;
+        mute_limit = (mute_limit * 2) + 128;
 
     //////////////// handle lossless or hybrid lossy mono data /////////////////
 
@@ -529,40 +119,40 @@ int32_t unpack_samples (WavpackContext *wpc, int32_t *buffer, uint32_t sample_co
         else
             i = get_words_lossless (wps, buffer, sample_count);
 
-        for (bptr = buffer; bptr < eptr;) {
-            read_word = *bptr;
-
+#ifdef DECORR_MONO_PASS_CONT
+        if (sample_count < 16)
+            for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount--; dpp++)
+                decorr_mono_pass (dpp, buffer, sample_count);
+        else
             for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount--; dpp++) {
-                int32_t sam, temp;
-                int k;
+                int pre_samples = (dpp->term > MAX_TERM) ? 2 : dpp->term;
 
-                if (dpp->term > MAX_TERM) {
-                    if (dpp->term & 1)
-                        sam = 2 * dpp->samples_A [0] - dpp->samples_A [1];
-                    else
-                        sam = dpp->samples_A [0] + ((dpp->samples_A [0] - dpp->samples_A [1]) >> 1);
+                decorr_mono_pass (dpp, buffer, pre_samples);
 
-                    dpp->samples_A [1] = dpp->samples_A [0];
-                    k = 0;
-                }
-                else {
-                    sam = dpp->samples_A [m];
-                    k = (m + dpp->term) & (MAX_TERM - 1);
-                }
-
-                temp = apply_weight (dpp->weight_A, sam) + read_word;
-                update_weight (dpp->weight_A, dpp->delta, sam, read_word);
-                dpp->samples_A [k] = read_word = temp;
+                DECORR_MONO_PASS_CONT (dpp, buffer + pre_samples, sample_count - pre_samples,
+                    ((flags & MAG_MASK) >> MAG_LSB) > 15);
             }
+#else
+        for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount--; dpp++)
+            decorr_mono_pass (dpp, buffer, sample_count);
+#endif
 
-            if (labs (read_word) > mute_limit) {
+#ifndef LOSSY_MUTE
+        if (!(flags & HYBRID_FLAG))
+#endif
+        for (bptr = buffer; bptr < eptr; ++bptr) {
+            if (labs (bptr [0]) > mute_limit) {
                 i = (uint32_t)(bptr - buffer);
                 break;
             }
 
-            m = (m + 1) & (MAX_TERM - 1);
-            crc += (crc << 1) + (*bptr++ = read_word);
+            crc = crc * 3 + bptr [0];
         }
+#ifndef LOSSY_MUTE
+        else
+            for (bptr = buffer; bptr < eptr; ++bptr)
+                crc = crc * 3 + bptr [0];
+#endif
     }
 
     /////////////// handle lossless or hybrid lossy stereo data ///////////////
@@ -583,36 +173,27 @@ int32_t unpack_samples (WavpackContext *wpc, int32_t *buffer, uint32_t sample_co
         else
             i = get_words_lossless (wps, buffer, sample_count);
 
-#ifdef FAST_DECODE
-        for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount--; dpp++)
-            if (((flags & MAG_MASK) >> MAG_LSB) >= 16)
+#ifdef DECORR_STEREO_PASS_CONT
+        if (sample_count < 16 || !DECORR_STEREO_PASS_CONT_AVAILABLE) {
+            for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount--; dpp++)
                 decorr_stereo_pass (dpp, buffer, sample_count);
-            else if (tcount && dpp [0].term == 17 && dpp [1].term == 17) {
-                decorr_stereo_pass_1717 (dpp, buffer, sample_count);
-                tcount--;
-                dpp++;
+
+            m = sample_count & (MAX_TERM - 1);
+        }
+        else
+            for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount--; dpp++) {
+                int pre_samples = (dpp->term < 0 || dpp->term > MAX_TERM) ? 2 : dpp->term;
+
+                decorr_stereo_pass (dpp, buffer, pre_samples);
+
+                DECORR_STEREO_PASS_CONT (dpp, buffer + pre_samples * 2, sample_count - pre_samples,
+                    ((flags & MAG_MASK) >> MAG_LSB) >= 16);
             }
-            else if (tcount && dpp [0].term == 17 && dpp [1].term == 18) {
-                decorr_stereo_pass_1718 (dpp, buffer, sample_count);
-                tcount--;
-                dpp++;
-            }
-            else if (tcount && dpp [0].term == 18 && dpp [1].term == 18) {
-                decorr_stereo_pass_1818 (dpp, buffer, sample_count);
-                tcount--;
-                dpp++;
-            }
-            else if (tcount && dpp [0].term >= 1 && dpp [0].term <= 7 &&
-                               dpp [1].term >= 1 && dpp [1].term <= 7) {
-                decorr_stereo_pass_nn (dpp, buffer, sample_count);
-                tcount--;
-                dpp++;
-            }
-            else
-                decorr_stereo_pass_i (dpp, buffer, sample_count);
 #else
         for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount--; dpp++)
             decorr_stereo_pass (dpp, buffer, sample_count);
+
+        m = sample_count & (MAX_TERM - 1);
 #endif
 
         if (flags & JOINT_STEREO)
@@ -624,13 +205,14 @@ int32_t unpack_samples (WavpackContext *wpc, int32_t *buffer, uint32_t sample_co
             for (bptr = buffer; bptr < eptr; bptr += 2)
                 crc += (crc << 3) + (bptr [0] << 1) + bptr [0] + bptr [1];
 
+#ifndef LOSSY_MUTE
+        if (!(flags & HYBRID_FLAG))
+#endif
         for (bptr = buffer; bptr < eptr; bptr += 16)
             if (labs (bptr [0]) > mute_limit || labs (bptr [1]) > mute_limit) {
                 i = (uint32_t)(bptr - buffer) / 2;
                 break;
             }
-
-        m = sample_count & (MAX_TERM - 1);
     }
 
     /////////////////// handle hybrid lossless mono data ////////////////////
@@ -686,10 +268,9 @@ int32_t unpack_samples (WavpackContext *wpc, int32_t *buffer, uint32_t sample_co
 
             crc += (crc << 1) + read_word;
 
-#ifdef LOSSY_MUTE
             if (labs (read_word) > mute_limit)
                 break;
-#endif
+
             *bptr++ = read_word;
         }
 
@@ -858,10 +439,9 @@ int32_t unpack_samples (WavpackContext *wpc, int32_t *buffer, uint32_t sample_co
                 right = right_c;
             }
 
-#ifdef LOSSY_MUTE
             if (labs (left) > mute_limit || labs (right) > mute_limit)
                 break;
-#endif
+
             crc += (crc << 3) + (left << 1) + left + right;
             *bptr++ = left;
             *bptr++ = right;
@@ -917,6 +497,67 @@ int32_t unpack_samples (WavpackContext *wpc, int32_t *buffer, uint32_t sample_co
     return i;
 }
 
+// General function to perform mono decorrelation pass on specified buffer
+// (although since this is the reverse function it might technically be called
+// "correlation" instead). This version handles all sample resolutions and
+// weight deltas. The dpp->samples_X[] data is returned normalized for term
+// values 1-8 (rotated so that the next sample to be read is at index 0).
+
+static void decorr_mono_pass (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count)
+{
+    int32_t delta = dpp->delta, weight_A = dpp->weight_A;    // local copies; weight_A is stored back into dpp at the end
+    int32_t *bptr, *eptr = buffer + sample_count, sam_A;    // eptr is one past the last sample processed
+    int m, k;
+
+    switch (dpp->term) {
+
+        case 17:    // prediction built from the two most recent outputs: 2 * s[0] - s[1]
+            for (bptr = buffer; bptr < eptr; bptr++) {
+                sam_A = 2 * dpp->samples_A [0] - dpp->samples_A [1];
+                dpp->samples_A [1] = dpp->samples_A [0];    // shift the two-entry history
+                dpp->samples_A [0] = apply_weight (weight_A, sam_A) + bptr [0];    // weighted prediction + stored value
+                update_weight (weight_A, delta, sam_A, bptr [0]);    // bptr [0] is still the input value here; presumably adjusts weight_A in place (helper defined elsewhere)
+                bptr [0] = dpp->samples_A [0];    // overwrite the input with the reconstructed sample
+            }
+
+            break;
+
+        case 18:    // same as term 17 but the prediction is (3 * s[0] - s[1]) >> 1
+            for (bptr = buffer; bptr < eptr; bptr++) {
+                sam_A = (3 * dpp->samples_A [0] - dpp->samples_A [1]) >> 1;
+                dpp->samples_A [1] = dpp->samples_A [0];    // shift the two-entry history
+                dpp->samples_A [0] = apply_weight (weight_A, sam_A) + bptr [0];    // weighted prediction + stored value
+                update_weight (weight_A, delta, sam_A, bptr [0]);    // bptr [0] is still the input value here
+                bptr [0] = dpp->samples_A [0];    // overwrite the input with the reconstructed sample
+            }
+
+            break;
+
+        default:    // every other term: samples_A [] is used as a circular history indexed modulo MAX_TERM
+            for (m = 0, k = dpp->term & (MAX_TERM - 1), bptr = buffer; bptr < eptr; bptr++) {    // m = read index, k = write index (term entries ahead of m)
+                sam_A = dpp->samples_A [m];
+                dpp->samples_A [k] = apply_weight (weight_A, sam_A) + bptr [0];    // weighted prediction + stored value
+                update_weight (weight_A, delta, sam_A, bptr [0]);    // bptr [0] is still the input value here
+                bptr [0] = dpp->samples_A [k];    // overwrite the input with the reconstructed sample
+                m = (m + 1) & (MAX_TERM - 1);    // advance both indices with wrap-around
+                k = (k + 1) & (MAX_TERM - 1);
+            }
+
+            if (m) {    // read index did not end on 0: rotate the history so it does
+                int32_t temp_samples [MAX_TERM];
+
+                memcpy (temp_samples, dpp->samples_A, sizeof (dpp->samples_A));    // snapshot, because the rotation below overwrites in place
+
+                for (k = 0; k < MAX_TERM; k++, m++)
+                    dpp->samples_A [k] = temp_samples [m & (MAX_TERM - 1)];    // old entry m becomes entry 0, and so on around the ring
+            }
+
+            break;
+    }
+
+    dpp->weight_A = weight_A;    // publish the adapted weight back to the pass state
+}
+
 // General function to perform stereo decorrelation pass on specified buffer
 // (although since this is the reverse function it might technically be called
 // "correlation" instead). This version handles all sample resolutions and
@@ -1028,245 +669,6 @@ static void decorr_stereo_pass (struct decorr_pass *dpp, int32_t *buffer, int32_
     }
 }
 
-#ifdef FAST_DECODE
-
-// This function is a specialized version of decorr_stereo_pass() that works
-// only with lower resolution data (<= 16-bit), but is otherwise identical.
-
-static void decorr_stereo_pass_i (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count)
-{
-    int32_t *bptr, *eptr = buffer + (sample_count * 2);
-    int m, k;
-
-    switch (dpp->term) {
-        case 17:
-            for (bptr = buffer; bptr < eptr; bptr += 2) {
-                int32_t sam, tmp;
-
-                sam = 2 * dpp->samples_A [0] - dpp->samples_A [1];
-                dpp->samples_A [1] = dpp->samples_A [0];
-                bptr [0] = dpp->samples_A [0] = apply_weight_i (dpp->weight_A, sam) + (tmp = bptr [0]);
-                update_weight (dpp->weight_A, dpp->delta, sam, tmp);
-
-                sam = 2 * dpp->samples_B [0] - dpp->samples_B [1];
-                dpp->samples_B [1] = dpp->samples_B [0];
-                bptr [1] = dpp->samples_B [0] = apply_weight_i (dpp->weight_B, sam) + (tmp = bptr [1]);
-                update_weight (dpp->weight_B, dpp->delta, sam, tmp);
-            }
-
-            break;
-
-        case 18:
-            for (bptr = buffer; bptr < eptr; bptr += 2) {
-                int32_t sam, tmp;
-
-                sam = dpp->samples_A [0] + ((dpp->samples_A [0] - dpp->samples_A [1]) >> 1);
-                dpp->samples_A [1] = dpp->samples_A [0];
-                bptr [0] = dpp->samples_A [0] = apply_weight_i (dpp->weight_A, sam) + (tmp = bptr [0]);
-                update_weight (dpp->weight_A, dpp->delta, sam, tmp);
-
-                sam = dpp->samples_B [0] + ((dpp->samples_B [0] - dpp->samples_B [1]) >> 1);
-                dpp->samples_B [1] = dpp->samples_B [0];
-                bptr [1] = dpp->samples_B [0] = apply_weight_i (dpp->weight_B, sam) + (tmp = bptr [1]);
-                update_weight (dpp->weight_B, dpp->delta, sam, tmp);
-            }
-
-            break;
-
-        default:
-            for (m = 0, k = dpp->term & (MAX_TERM - 1), bptr = buffer; bptr < eptr; bptr += 2) {
-                int32_t sam;
-
-                sam = dpp->samples_A [m];
-                dpp->samples_A [k] = apply_weight_i (dpp->weight_A, sam) + bptr [0];
-                update_weight (dpp->weight_A, dpp->delta, sam, bptr [0]);
-                bptr [0] = dpp->samples_A [k];
-
-                sam = dpp->samples_B [m];
-                dpp->samples_B [k] = apply_weight_i (dpp->weight_B, sam) + bptr [1];
-                update_weight (dpp->weight_B, dpp->delta, sam, bptr [1]);
-                bptr [1] = dpp->samples_B [k];
-
-                m = (m + 1) & (MAX_TERM - 1);
-                k = (k + 1) & (MAX_TERM - 1);
-            }
-
-            break;
-
-        case -1:
-            for (bptr = buffer; bptr < eptr; bptr += 2) {
-                int32_t sam;
-
-                sam = bptr [0] + apply_weight_i (dpp->weight_A, dpp->samples_A [0]);
-                update_weight_clip (dpp->weight_A, dpp->delta, dpp->samples_A [0], bptr [0]);
-                bptr [0] = sam;
-                dpp->samples_A [0] = bptr [1] + apply_weight_i (dpp->weight_B, sam);
-                update_weight_clip (dpp->weight_B, dpp->delta, sam, bptr [1]);
-                bptr [1] = dpp->samples_A [0];
-            }
-
-            break;
-
-        case -2:
-            for (bptr = buffer; bptr < eptr; bptr += 2) {
-                int32_t sam;
-
-                sam = bptr [1] + apply_weight_i (dpp->weight_B, dpp->samples_B [0]);
-                update_weight_clip (dpp->weight_B, dpp->delta, dpp->samples_B [0], bptr [1]);
-                bptr [1] = sam;
-                dpp->samples_B [0] = bptr [0] + apply_weight_i (dpp->weight_A, sam);
-                update_weight_clip (dpp->weight_A, dpp->delta, sam, bptr [0]);
-                bptr [0] = dpp->samples_B [0];
-            }
-
-            break;
-
-        case -3:
-            for (bptr = buffer; bptr < eptr; bptr += 2) {
-                int32_t sam_A, sam_B;
-
-                sam_A = bptr [0] + apply_weight_i (dpp->weight_A, dpp->samples_A [0]);
-                update_weight_clip (dpp->weight_A, dpp->delta, dpp->samples_A [0], bptr [0]);
-                sam_B = bptr [1] + apply_weight_i (dpp->weight_B, dpp->samples_B [0]);
-                update_weight_clip (dpp->weight_B, dpp->delta, dpp->samples_B [0], bptr [1]);
-                bptr [0] = dpp->samples_B [0] = sam_A;
-                bptr [1] = dpp->samples_A [0] = sam_B;
-            }
-
-            break;
-    }
-}
-
-// These functions are specialized versions of decorr_stereo_pass() that work
-// only with lower resolution data (<= 16-bit) and handle the equivalent of
-// *two* decorrelation passes. By combining two passes we save a read and write
-// of the sample data and some overhead dealing with buffer pointers and looping.
-//
-// The cases handled are:
-//     17,17 -- standard "fast" mode before version 4.40
-//     17,18 -- standard "fast" mode starting with 4.40
-//     18,18 -- used in the default and higher modes
-//     [1-7],[1-7] -- common in "high" and "very high" modes
-
-static void decorr_stereo_pass_1718 (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count)
-{
-    int32_t *bptr, *eptr = buffer + (sample_count * 2);
-
-    for (bptr = buffer; bptr < eptr; bptr += 2) {
-        int32_t sam;
-
-        sam = 2 * dpp->samples_A [0] - dpp->samples_A [1];
-        dpp->samples_A [1] = dpp->samples_A [0];
-        dpp->samples_A [0] = apply_weight_i (dpp->weight_A, sam) + bptr [0];
-        update_weight (dpp->weight_A, dpp->delta, sam, bptr [0]);
-
-        sam = (dpp+1)->samples_A [0] + (((dpp+1)->samples_A [0] - (dpp+1)->samples_A [1]) >> 1);
-        (dpp+1)->samples_A [1] = (dpp+1)->samples_A [0];
-        bptr [0] = (dpp+1)->samples_A [0] = apply_weight_i ((dpp+1)->weight_A, sam) + dpp->samples_A [0];
-        update_weight ((dpp+1)->weight_A, (dpp+1)->delta, sam, dpp->samples_A [0]);
-
-        sam = 2 * dpp->samples_B [0] - dpp->samples_B [1];
-        dpp->samples_B [1] = dpp->samples_B [0];
-        dpp->samples_B [0] = apply_weight_i (dpp->weight_B, sam) + bptr [1];
-        update_weight (dpp->weight_B, dpp->delta, sam, bptr [1]);
-
-        sam = (dpp+1)->samples_B [0] + (((dpp+1)->samples_B [0] - (dpp+1)->samples_B [1]) >> 1);
-        (dpp+1)->samples_B [1] = (dpp+1)->samples_B [0];
-        bptr [1] = (dpp+1)->samples_B [0] = apply_weight_i ((dpp+1)->weight_B, sam) + dpp->samples_B [0];
-        update_weight ((dpp+1)->weight_B, (dpp+1)->delta, sam, dpp->samples_B [0]);
-    }
-}
-
-static void decorr_stereo_pass_1717 (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count)
-{
-    int32_t *bptr, *eptr = buffer + (sample_count * 2);
-
-    for (bptr = buffer; bptr < eptr; bptr += 2) {
-        int32_t sam;
-
-        sam = 2 * dpp->samples_A [0] - dpp->samples_A [1];
-        dpp->samples_A [1] = dpp->samples_A [0];
-        dpp->samples_A [0] = apply_weight_i (dpp->weight_A, sam) + bptr [0];
-        update_weight (dpp->weight_A, dpp->delta, sam, bptr [0]);
-
-        sam = 2 * (dpp+1)->samples_A [0] - (dpp+1)->samples_A [1];
-        (dpp+1)->samples_A [1] = (dpp+1)->samples_A [0];
-        bptr [0] = (dpp+1)->samples_A [0] = apply_weight_i ((dpp+1)->weight_A, sam) + dpp->samples_A [0];
-        update_weight ((dpp+1)->weight_A, (dpp+1)->delta, sam, dpp->samples_A [0]);
-
-        sam = 2 * dpp->samples_B [0] - dpp->samples_B [1];
-        dpp->samples_B [1] = dpp->samples_B [0];
-        dpp->samples_B [0] = apply_weight_i (dpp->weight_B, sam) + bptr [1];
-        update_weight (dpp->weight_B, dpp->delta, sam, bptr [1]);
-
-        sam = 2 * (dpp+1)->samples_B [0] - (dpp+1)->samples_B [1];
-        (dpp+1)->samples_B [1] = (dpp+1)->samples_B [0];
-        bptr [1] = (dpp+1)->samples_B [0] = apply_weight_i ((dpp+1)->weight_B, sam) + dpp->samples_B [0];
-        update_weight ((dpp+1)->weight_B, (dpp+1)->delta, sam, dpp->samples_B [0]);
-    }
-}
-
-static void decorr_stereo_pass_1818 (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count)
-{
-    int32_t *bptr, *eptr = buffer + (sample_count * 2);
-
-    for (bptr = buffer; bptr < eptr; bptr += 2) {
-        int32_t sam;
-
-        sam = dpp->samples_A [0] + ((dpp->samples_A [0] - dpp->samples_A [1]) >> 1);
-        dpp->samples_A [1] = dpp->samples_A [0];
-        dpp->samples_A [0] = apply_weight_i (dpp->weight_A, sam) + bptr [0];
-        update_weight (dpp->weight_A, dpp->delta, sam, bptr [0]);
-
-        sam = (dpp+1)->samples_A [0] + (((dpp+1)->samples_A [0] - (dpp+1)->samples_A [1]) >> 1);
-        (dpp+1)->samples_A [1] = (dpp+1)->samples_A [0];
-        bptr [0] = (dpp+1)->samples_A [0] = apply_weight_i ((dpp+1)->weight_A, sam) + dpp->samples_A [0];
-        update_weight ((dpp+1)->weight_A, (dpp+1)->delta, sam, dpp->samples_A [0]);
-
-        sam = dpp->samples_B [0] + ((dpp->samples_B [0] - dpp->samples_B [1]) >> 1);
-        dpp->samples_B [1] = dpp->samples_B [0];
-        dpp->samples_B [0] = apply_weight_i (dpp->weight_B, sam) + bptr [1];
-        update_weight (dpp->weight_B, dpp->delta, sam, bptr [1]);
-
-        sam = (dpp+1)->samples_B [0] + (((dpp+1)->samples_B [0] - (dpp+1)->samples_B [1]) >> 1);
-        (dpp+1)->samples_B [1] = (dpp+1)->samples_B [0];
-        bptr [1] = (dpp+1)->samples_B [0] = apply_weight_i ((dpp+1)->weight_B, sam) + dpp->samples_B [0];
-        update_weight ((dpp+1)->weight_B, (dpp+1)->delta, sam, dpp->samples_B [0]);
-    }
-}
-
-static void decorr_stereo_pass_nn (struct decorr_pass *dpp, int32_t *buffer, int32_t sample_count)
-{
-    int32_t *bptr, *eptr = buffer + (sample_count * 2);
-    int m, k, j;
-
-    m = 0;
-    k = dpp->term & (MAX_TERM - 1);
-    j = (dpp+1)->term & (MAX_TERM - 1);
-
-    for (bptr = buffer; bptr < eptr; bptr += 2) {
-        int32_t tmp;
-
-        dpp->samples_A [k] = apply_weight_i (dpp->weight_A, dpp->samples_A [m]) + (tmp = bptr [0]);
-        update_weight (dpp->weight_A, dpp->delta, dpp->samples_A [m], tmp);
-
-        bptr [0] = (dpp+1)->samples_A [j] = apply_weight_i ((dpp+1)->weight_A, (dpp+1)->samples_A [m]) + (tmp = dpp->samples_A [k]);
-        update_weight ((dpp+1)->weight_A, (dpp+1)->delta, (dpp+1)->samples_A [m], tmp);
-
-        dpp->samples_B [k] = apply_weight_i (dpp->weight_B, dpp->samples_B [m]) + (tmp = bptr [1]);
-        update_weight (dpp->weight_B, dpp->delta, dpp->samples_B [m], tmp);
-
-        bptr [1] = (dpp+1)->samples_B [j] = apply_weight_i ((dpp+1)->weight_B, (dpp+1)->samples_B [m]) + (tmp = dpp->samples_B [k]);
-        update_weight ((dpp+1)->weight_B, (dpp+1)->delta, (dpp+1)->samples_B [m], tmp);
-
-        m = (m + 1) & (MAX_TERM - 1);
-        k = (k + 1) & (MAX_TERM - 1);
-        j = (j + 1) & (MAX_TERM - 1);
-    }
-}
-
-#endif
-
 // This is a helper function for unpack_samples() that applies several final
 // operations. First, if the data is 32-bit float data, then that conversion
 // is done in the float.c module (whether lossy or lossless) and we return.
@@ -1413,5 +815,3 @@ int check_crc_error (WavpackContext *wpc)
 
     return result;
 }
-
-#endif
diff --git a/third_party/wavpack/src/unpack3.c b/third_party/wavpack/src/unpack3.c
index d877a79..dccf085 100644
--- a/third_party/wavpack/src/unpack3.c
+++ b/third_party/wavpack/src/unpack3.c
@@ -1,7 +1,7 @@
 ////////////////////////////////////////////////////////////////////////////
 //                           **** WAVPACK ****                            //
 //                  Hybrid Lossless Wavefile Compressor                   //
-//              Copyright (c) 1998 - 2006 Conifer Software.               //
+//              Copyright (c) 1998 - 2013 Conifer Software.               //
 //                          All Rights Reserved.                          //
 //      Distributed under the BSD Software License (see license.txt)      //
 ////////////////////////////////////////////////////////////////////////////
@@ -12,387 +12,24 @@
 // not including "raw" files. As these modes are all obsolete and are no
 // longer written, this code will not be fully documented other than the
 // global functions. However, full documenation is provided in the version
-// 3.97 source code.
+// 3.97 source code. Note that this module does only the low-level sample
+// unpacking; the actual opening of the file (and obtaining information
+// from it) is handled in the unpack3_open.c module.
+
+#ifdef ENABLE_LEGACY
 
 #include <stdlib.h>
-#include <stdio.h>
 #include <string.h>
-#include <math.h>
 
 #include "wavpack_local.h"
 #include "unpack3.h"
 
 #define ATTEMPT_ERROR_MUTING
 
-#ifdef DEBUG_ALLOC
-#define malloc malloc_db
-#define realloc realloc_db
-#define free free_db
-void *malloc_db (uint32_t size);
-void *realloc_db (void *ptr, uint32_t size);
-void free_db (void *ptr);
-int32_t dump_alloc (void);
-#endif
-
-static void unpack_init3 (WavpackStream3 *wps);
-static int bs_open_read3 (Bitstream3 *bs, WavpackStreamReader *reader, void *id);
-static void bs_close_read3 (Bitstream3 *bs);
-#ifndef NO_SEEKING
-static void bs_restore3 (Bitstream3 *bs);
-#endif
-
-// This provides an extension to the WavpackOpenFileRead () function contained
-// in the wputils.c module. It is assumed that an 'R' had been read as the
-// first character of the file/stream (indicating a non-raw pre version 4.0
-// WavPack file) and had been pushed back onto the stream (or simply seeked
-// back to).
-
-WavpackContext *open_file3 (WavpackContext *wpc, char *error)
-{
-    RiffChunkHeader RiffChunkHeader;
-    ChunkHeader ChunkHeader;
-    WavpackHeader3 wphdr;
-    WavpackStream3 *wps;
-    WaveHeader3 wavhdr;
-
-    CLEAR (wavhdr);
-    wpc->stream3 = wps = (WavpackStream3 *) malloc (sizeof (WavpackStream3));
-    CLEAR (*wps);
-
-    if (wpc->reader->read_bytes (wpc->wv_in, &RiffChunkHeader, sizeof (RiffChunkHeader)) !=
-        sizeof (RiffChunkHeader)) {
-            if (error) strcpy (error, "not a valid WavPack file!");
-            return WavpackCloseFile (wpc);
-    }
-
-    if (!strncmp (RiffChunkHeader.ckID, "RIFF", 4) && !strncmp (RiffChunkHeader.formType, "WAVE", 4)) {
-
-        if (wpc->open_flags & OPEN_WRAPPER) {
-            wpc->wrapper_data = malloc (wpc->wrapper_bytes = sizeof (RiffChunkHeader));
-            memcpy (wpc->wrapper_data, &RiffChunkHeader, sizeof (RiffChunkHeader));
-        }
-
-    // If the first chunk is a wave RIFF header, then read the various chunks
-    // until we get to the "data" chunk (and WavPack header should follow). If
-    // the first chunk is not a RIFF, then we assume a "raw" WavPack file and
-    // the WavPack header must be first.
-
-        while (1) {
-
-            if (wpc->reader->read_bytes (wpc->wv_in, &ChunkHeader, sizeof (ChunkHeader)) !=
-                sizeof (ChunkHeader)) {
-                    if (error) strcpy (error, "not a valid WavPack file!");
-                    return WavpackCloseFile (wpc);
-            }
-            else {
-                if (wpc->open_flags & OPEN_WRAPPER) {
-                    wpc->wrapper_data = realloc (wpc->wrapper_data, wpc->wrapper_bytes + sizeof (ChunkHeader));
-                    memcpy (wpc->wrapper_data + wpc->wrapper_bytes, &ChunkHeader, sizeof (ChunkHeader));
-                    wpc->wrapper_bytes += sizeof (ChunkHeader);
-                }
-
-                little_endian_to_native (&ChunkHeader, ChunkHeaderFormat);
-
-                if (!strncmp (ChunkHeader.ckID, "fmt ", 4)) {
-
-                    if (ChunkHeader.ckSize < sizeof (wavhdr) ||
-                        wpc->reader->read_bytes (wpc->wv_in, &wavhdr, sizeof (wavhdr)) != sizeof (wavhdr)) {
-                            if (error) strcpy (error, "not a valid WavPack file!");
-                            return WavpackCloseFile (wpc);
-                    }
-                    else if (wpc->open_flags & OPEN_WRAPPER) {
-                        wpc->wrapper_data = realloc (wpc->wrapper_data, wpc->wrapper_bytes + sizeof (wavhdr));
-                        memcpy (wpc->wrapper_data + wpc->wrapper_bytes, &wavhdr, sizeof (wavhdr));
-                        wpc->wrapper_bytes += sizeof (wavhdr);
-                    }
-
-                    little_endian_to_native (&wavhdr, WaveHeader3Format);
-
-                    if (ChunkHeader.ckSize > sizeof (wavhdr)) {
-                        uint32_t bytes_to_skip = (ChunkHeader.ckSize + 1 - sizeof (wavhdr)) & ~1L;
-
-                        if (bytes_to_skip > 1024 * 1024) {
-                            if (error) strcpy (error, "not a valid WavPack file!");
-                            return WavpackCloseFile (wpc);
-                        }
-
-                        if (wpc->open_flags & OPEN_WRAPPER) {
-                            wpc->wrapper_data = realloc (wpc->wrapper_data, wpc->wrapper_bytes + bytes_to_skip);
-                            wpc->reader->read_bytes (wpc->wv_in, wpc->wrapper_data + wpc->wrapper_bytes, bytes_to_skip);
-                            wpc->wrapper_bytes += bytes_to_skip;
-                        }
-                        else {
-                            unsigned char *temp = malloc (bytes_to_skip);
-                            wpc->reader->read_bytes (wpc->wv_in, temp, bytes_to_skip);
-                            free (temp);
-                        }
-                    }
-                }
-                else if (!strncmp (ChunkHeader.ckID, "data", 4))
-                    break;
-                else if ((ChunkHeader.ckSize + 1) & ~1L) {
-                    uint32_t bytes_to_skip = (ChunkHeader.ckSize + 1) & ~1L;
-
-                    if (bytes_to_skip > 1024 * 1024) {
-                        if (error) strcpy (error, "not a valid WavPack file!");
-                        return WavpackCloseFile (wpc);
-                    }
-
-                    if (wpc->open_flags & OPEN_WRAPPER) {
-                        wpc->wrapper_data = realloc (wpc->wrapper_data, wpc->wrapper_bytes + bytes_to_skip);
-                        wpc->reader->read_bytes (wpc->wv_in, wpc->wrapper_data + wpc->wrapper_bytes, bytes_to_skip);
-                        wpc->wrapper_bytes += bytes_to_skip;
-                    }
-                    else {
-                        unsigned char *temp = malloc (bytes_to_skip);
-                        wpc->reader->read_bytes (wpc->wv_in, temp, bytes_to_skip);
-                        free (temp);
-                    }
-                }
-            }
-        }
-    }
-    else {
-        if (error) strcpy (error, "not a valid WavPack file!");
-        return WavpackCloseFile (wpc);
-    }
-
-    if (wavhdr.FormatTag != 1 || !wavhdr.NumChannels || wavhdr.NumChannels > 2 ||
-        !wavhdr.SampleRate || wavhdr.BitsPerSample < 16 || wavhdr.BitsPerSample > 24 ||
-        wavhdr.BlockAlign / wavhdr.NumChannels > 3 || wavhdr.BlockAlign % wavhdr.NumChannels ||
-        wavhdr.BlockAlign / wavhdr.NumChannels < (wavhdr.BitsPerSample + 7) / 8) {
-            if (error) strcpy (error, "not a valid WavPack file!");
-            return WavpackCloseFile (wpc);
-    }
-
-    wpc->total_samples = ChunkHeader.ckSize / wavhdr.NumChannels /
-        ((wavhdr.BitsPerSample > 16) ? 3 : 2);
-
-    if (wpc->reader->read_bytes (wpc->wv_in, &wphdr, 10) != 10) {
-        if (error) strcpy (error, "not a valid WavPack file!");
-        return WavpackCloseFile (wpc);
-    }
-
-    if (((char *) &wphdr) [8] == 2 && (wpc->reader->read_bytes (wpc->wv_in, ((char *) &wphdr) + 10, 2) != 2)) {
-        if (error) strcpy (error, "not a valid WavPack file!");
-        return WavpackCloseFile (wpc);
-    }
-    else if (((char *) &wphdr) [8] == 3 && (wpc->reader->read_bytes (wpc->wv_in, ((char *) &wphdr) + 10,
-        sizeof (wphdr) - 10) != sizeof (wphdr) - 10)) {
-            if (error) strcpy (error, "not a valid WavPack file!");
-            return WavpackCloseFile (wpc);
-    }
-
-    little_endian_to_native (&wphdr, WavpackHeader3Format);
-
-    // make sure this is a version we know about
-
-    if (strncmp (wphdr.ckID, "wvpk", 4) || wphdr.version < 1 || wphdr.version > 3) {
-        if (error) strcpy (error, "not a valid WavPack file!");
-        return WavpackCloseFile (wpc);
-    }
-
-    // Because I ran out of flag bits in the WavPack header, an amazingly ugly
-    // kludge was forced upon me! This code takes care of preparing the flags
-    // field for internal use and checking for unknown formats we can't decode
-
-    if (wphdr.version == 3) {
-
-        if (wphdr.flags & EXTREME_DECORR) {
-
-            if ((wphdr.flags & NOT_STORED_FLAGS) ||
-                ((wphdr.bits) &&
-                (((wphdr.flags & NEW_HIGH_FLAG) &&
-                (wphdr.flags & (FAST_FLAG | HIGH_FLAG))) ||
-                (wphdr.flags & CROSS_DECORR)))) {
-                    if (error) strcpy (error, "not a valid WavPack file!");
-                    return WavpackCloseFile (wpc);
-            }
-
-            if (wphdr.flags & CANCEL_EXTREME)
-                wphdr.flags &= ~(EXTREME_DECORR | CANCEL_EXTREME);
-        }
-        else
-            wphdr.flags &= ~CROSS_DECORR;
-    }
-
-    // check to see if we should look for a "correction" file, and if so try
-    // to open it for reading, then set WVC_FLAG accordingly
-
-    if (wpc->wvc_in && wphdr.version == 3 && wphdr.bits && (wphdr.flags & NEW_HIGH_FLAG)) {
-        wpc->file2len = wpc->reader->get_length (wpc->wvc_in);
-        wphdr.flags |= WVC_FLAG;
-        wpc->wvc_flag = TRUE;
-    }
-    else
-        wphdr.flags &= ~WVC_FLAG;
-
-    // check WavPack version to handle special requirements of versions
-    // before 3.0 that had smaller headers
-
-    if (wphdr.version < 3) {
-        wphdr.total_samples = wpc->total_samples;
-        wphdr.flags = wavhdr.NumChannels == 1 ? MONO_FLAG : 0;
-        wphdr.shift = 16 - wavhdr.BitsPerSample;
-
-        if (wphdr.version == 1)
-            wphdr.bits = 0;
-    }
-
-    wpc->config.sample_rate = wavhdr.SampleRate;
-    wpc->config.num_channels = wavhdr.NumChannels;
-    wpc->config.channel_mask = 5 - wavhdr.NumChannels;
-
-    if (wphdr.flags & MONO_FLAG)
-        wpc->config.flags |= CONFIG_MONO_FLAG;
-
-    if (wphdr.flags & EXTREME_DECORR)
-        wpc->config.flags |= CONFIG_HIGH_FLAG;
-
-    if (wphdr.bits) {
-        if (wphdr.flags & NEW_HIGH_FLAG)
-            wpc->config.flags |= CONFIG_HYBRID_FLAG;
-        else
-            wpc->config.flags |= CONFIG_LOSSY_MODE;
-    }
-    else if (!(wphdr.flags & HIGH_FLAG))
-        wpc->config.flags |= CONFIG_FAST_FLAG;
-
-    wpc->config.bytes_per_sample = (wphdr.flags & BYTES_3) ? 3 : 2;
-    wpc->config.bits_per_sample = wavhdr.BitsPerSample;
-
-    memcpy (&wps->wphdr, &wphdr, sizeof (wphdr));
-    wps->wvbits.bufsiz = wps->wvcbits.bufsiz = 1024 * 1024;
-    return wpc;
-}
-
-// return currently decoded sample index
-
-uint32_t get_sample_index3 (WavpackContext *wpc)
-{
-    WavpackStream3 *wps = (WavpackStream3 *) wpc->stream3;
-
-    return (wps) ? wps->sample_index : (uint32_t) -1;
-}
-
-int get_version3 (WavpackContext *wpc)
-{
-    WavpackStream3 *wps = (WavpackStream3 *) wpc->stream3;
-
-    return (wps) ? wps->wphdr.version : 0;
-}
-
-void free_stream3 (WavpackContext *wpc)
-{
-    WavpackStream3 *wps = (WavpackStream3 *) wpc->stream3;
-
-    if (wps) {
-#ifndef NO_SEEKING
-        if (wps->unpack_data)
-            free (wps->unpack_data);
-#endif
-        if (wps->wphdr.flags & WVC_FLAG)
-            bs_close_read3 (&wps->wvcbits);
-
-        bs_close_read3 (&wps->wvbits);
-
-        free (wps);
-    }
-}
-
-static void bs_read3 (Bitstream3 *bs)
-{
-    uint32_t bytes_read;
-
-    bytes_read = bs->reader->read_bytes (bs->id, bs->buf, bs->bufsiz);
-    bs->end = bs->buf + bytes_read;
-    bs->fpos += bytes_read;
-
-    if (bs->end == bs->buf) {
-        memset (bs->buf, -1, bs->bufsiz);
-        bs->end += bs->bufsiz;
-    }
-
-    bs->ptr = bs->buf;
-}
-
-// Open the specified BitStream and associate with the specified file. The
-// "bufsiz" field of the structure must be preset with the desired buffer
-// size and the file's read pointer must be set to where the desired bit
-// data is located.  A return value of TRUE indicates an error in
-// allocating buffer space.
-
-static int bs_open_read3 (Bitstream3 *bs, WavpackStreamReader *reader, void *id)
-{
-    bs->fpos = (bs->reader = reader)->get_pos (bs->id = id);
-
-    if (!bs->buf)
-        bs->buf = (unsigned char *) malloc (bs->bufsiz);
-
-    bs->end = bs->buf + bs->bufsiz;
-    bs->ptr = bs->end - 1;
-    bs->sr = bs->bc = 0;
-    bs->error = bs->buf ? 0 : 1;
-    bs->wrap = bs_read3;
-    return bs->error;
-}
-
-#ifndef NO_SEEKING
-
-// This function is called after a call to unpack_restore() has restored
-// the BitStream structure to a previous state and causes any required data
-// to be read from the file. This function is NOT supported for overlapped
-// operation.
-
-static void bs_restore3 (Bitstream3 *bs)
-{
-    uint32_t bytes_to_read = (uint32_t)(bs->end - bs->ptr - 1), bytes_read;
-
-    bs->reader->set_pos_abs (bs->id, bs->fpos - bytes_to_read);
-
-    if (bytes_to_read > 0) {
-
-        bytes_read = bs->reader->read_bytes (bs->id, bs->ptr + 1, bytes_to_read);
-
-        if (bytes_to_read != bytes_read)
-            bs->end = bs->ptr + 1 + bytes_read;
-    }
-}
-
-#endif
-
-// This function is called to release any resources used by the BitStream
-// and position the file pointer to the first byte past the read bits.
-
-static void bs_close_read3 (Bitstream3 *bs)
-{
-    if (bs->buf) {
-        free (bs->buf);
-        CLEAR (*bs);
-    }
-}
-
-static uint32_t bs_unused_bytes (Bitstream3 *bs)
-{
-    if (bs->bc < 8) {
-        bs->bc += 8;
-        bs->ptr++;
-    }
-
-    return (uint32_t)(bs->end - bs->ptr);
-}
-
-static unsigned char *bs_unused_data (Bitstream3 *bs)
-{
-    if (bs->bc < 8) {
-        bs->bc += 8;
-        bs->ptr++;
-    }
-
-    return bs->ptr;
-}
-
-#ifndef NO_UNPACK
+static int bs_open_read3 (Bitstream3 *bs, WavpackStreamReader64 *reader, void *id);
+static uint32_t bs_unused_bytes (Bitstream3 *bs);
+static unsigned char *bs_unused_data (Bitstream3 *bs);
+static void init_words3 (WavpackStream3 *wps);
 
 //////////////////////////////// local macros /////////////////////////////////
 
@@ -426,13 +63,13 @@ static const signed char extreme_terms [] = { 1,1,1,2,4,-1,1,2,3,6,-2,8,5,7,4,1,
 static const signed char default_terms [] = { 1,1,1,-1,2,1,-2 };
 static const signed char simple_terms []  = { 1,1,1,1 };
 
+///////////////////////////// executable code ////////////////////////////////
+
 // This function initializes everything required to unpack WavPack
 // bitstreams and must be called before any unpacking is performed. Note
 // that the (WavpackHeader3 *) in the WavpackStream3 struct must be valid.
 
-static void init_words3 (WavpackStream3 *wps);
-
-static void unpack_init3 (WavpackStream3 *wps)
+void unpack_init3 (WavpackStream3 *wps)
 {
     int flags = wps->wphdr.flags;
     struct decorr_pass *dpp;
@@ -461,9 +98,6 @@ static void unpack_init3 (WavpackStream3 *wps)
 
 #ifndef NO_SEEKING
 
-#define SAVE(destin, item) { memcpy (destin, &item, sizeof (item)); destin = (char *) destin + sizeof (item); }
-#define RESTORE(item, source) { memcpy (&item, source, sizeof (item)); source = (char *) source + sizeof (item); }
-
 // This function returns the size (in bytes) required to save the unpacking
 // context. Note that the (WavpackHeader3 *) in the WavpackStream3 struct
 // must be valid.
@@ -504,7 +138,7 @@ static int unpack_size (WavpackStream3 *wps)
     }
 
     if (flags & (HIGH_FLAG | NEW_HIGH_FLAG))
-        for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount--; dpp++)
+        for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount--; dpp++) {
             if (dpp->term > 0) {
                 byte_sum += sizeof (dpp->samples_A [0]) * dpp->term;
                 byte_sum += sizeof (dpp->weight_A);
@@ -518,6 +152,7 @@ static int unpack_size (WavpackStream3 *wps)
                 byte_sum += sizeof (dpp->samples_A [0]) + sizeof (dpp->samples_B [0]);
                 byte_sum += sizeof (dpp->weight_A) + sizeof (dpp->weight_B);
             }
+        }
 
     return byte_sum;
 }
@@ -572,7 +207,7 @@ static void *unpack_save (WavpackStream3 *wps, void *destin)
     }
 
     if (flags & (HIGH_FLAG | NEW_HIGH_FLAG))
-        for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount--; dpp++)
+        for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount--; dpp++) {
             if (dpp->term > 0) {
                 int count = dpp->term;
                 int index = wps->dc.m;
@@ -602,177 +237,11 @@ static void *unpack_save (WavpackStream3 *wps, void *destin)
                 SAVE (destin, dpp->samples_A [0]);
                 SAVE (destin, dpp->samples_B [0]);
             }
+        }
 
     return destin;
 }
 
-// This function restores the unpacking context from the specified pointer
-// and returns the updated pointer. After this call, unpack_samples() will
-// continue where it left off immediately before unpack_save() was called.
-// If the WavPack files and bitstreams might have been closed and reopened,
-// then the "keep_resources" flag should be set to avoid using the "old"
-// resources that were originally saved (and are probably now invalid).
-
-static void *unpack_restore (WavpackStream3 *wps, void *source, int keep_resources)
-{
-    int flags = wps->wphdr.flags, tcount;
-    struct decorr_pass *dpp;
-    FILE *temp_file;
-    unsigned char *temp_buf;
-
-    unpack_init3 (wps);
-    temp_file = wps->wvbits.id;
-    temp_buf = wps->wvbits.buf;
-    RESTORE (wps->wvbits, source);
-
-    if (keep_resources) {
-        wps->wvbits.id = temp_file;
-        wps->wvbits.ptr += temp_buf - wps->wvbits.buf;
-        wps->wvbits.end += temp_buf - wps->wvbits.buf;
-        wps->wvbits.buf = temp_buf;
-    }
-
-    bs_restore3 (&wps->wvbits);
-
-    if (flags & WVC_FLAG) {
-        temp_file = wps->wvcbits.id;
-        temp_buf = wps->wvcbits.buf;
-        RESTORE (wps->wvcbits, source);
-
-        if (keep_resources) {
-            wps->wvcbits.id = temp_file;
-            wps->wvcbits.ptr += temp_buf - wps->wvcbits.buf;
-            wps->wvcbits.end += temp_buf - wps->wvcbits.buf;
-            wps->wvcbits.buf = temp_buf;
-        }
-
-        bs_restore3 (&wps->wvcbits);
-    }
-
-    if (wps->wphdr.version == 3) {
-        if (wps->wphdr.bits) {
-            RESTORE (wps->w4, source);
-        }
-        else {
-            RESTORE (wps->w1, source);
-        }
-
-        RESTORE (wps->w3, source);
-        RESTORE (wps->dc.crc, source);
-    }
-    else
-        RESTORE (wps->w2, source);
-
-    if (wps->wphdr.bits) {
-        RESTORE (wps->dc.error, source);
-    }
-    else {
-        RESTORE (wps->dc.sum_level, source);
-        RESTORE (wps->dc.left_level, source);
-        RESTORE (wps->dc.right_level, source);
-        RESTORE (wps->dc.diff_level, source);
-    }
-
-    if (flags & OVER_20) {
-        RESTORE (wps->dc.last_extra_bits, source);
-        RESTORE (wps->dc.extra_bits_count, source);
-    }
-
-    if (!(flags & EXTREME_DECORR)) {
-        RESTORE (wps->dc.sample, source);
-        RESTORE (wps->dc.weight, source);
-    }
-
-    if (flags & (HIGH_FLAG | NEW_HIGH_FLAG))
-        for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount--; dpp++)
-            if (dpp->term > 0) {
-                int count = dpp->term;
-                int index = wps->dc.m;
-
-                RESTORE (dpp->weight_A, source);
-
-                while (count--) {
-                    RESTORE (dpp->samples_A [index], source);
-                    index = (index + 1) & (MAX_TERM - 1);
-                }
-
-                if (!(flags & MONO_FLAG)) {
-                    count = dpp->term;
-                    index = wps->dc.m;
-
-                    RESTORE (dpp->weight_B, source);
-
-                    while (count--) {
-                        RESTORE (dpp->samples_B [index], source);
-                        index = (index + 1) & (MAX_TERM - 1);
-                    }
-                }
-            }
-            else {
-                RESTORE (dpp->weight_A, source);
-                RESTORE (dpp->weight_B, source);
-                RESTORE (dpp->samples_A [0], source);
-                RESTORE (dpp->samples_B [0], source);
-            }
-
-    return source;
-}
-
-// This is an extension for WavpackSeekSample (). Note that because WavPack
-// files created prior to version 4.0 are not inherently seekable, this
-// function could take a long time if a forward seek is requested to an
-// area that has not been played (or seeked through) yet.
-
-
-int seek_sample3 (WavpackContext *wpc, uint32_t desired_index)
-{
-    int points_index = desired_index / ((wpc->total_samples >> 8) + 1);
-    WavpackStream3 *wps = (WavpackStream3 *) wpc->stream3;
-
-    if (desired_index >= wpc->total_samples)
-        return FALSE;
-
-    while (points_index)
-        if (wps->index_points [points_index].saved &&
-            wps->index_points [points_index].sample_index <= desired_index)
-                break;
-        else
-            points_index--;
-
-    if (wps->index_points [points_index].saved)
-        if (wps->index_points [points_index].sample_index > wps->sample_index ||
-            wps->sample_index > desired_index) {
-                wps->sample_index = wps->index_points [points_index].sample_index;
-                unpack_restore (wps, wps->unpack_data + points_index * wps->unpack_size, TRUE);
-        }
-
-    if (desired_index > wps->sample_index) {
-        int32_t *buffer = (int32_t *) malloc (1024 * (wps->wphdr.flags & MONO_FLAG ? 4 : 8));
-        uint32_t samples_to_skip = desired_index - wps->sample_index;
-
-        while (1) {
-            if (samples_to_skip > 1024) {
-                if (unpack_samples3 (wpc, buffer, 1024) == 1024)
-                    samples_to_skip -= 1024;
-                else
-                    break;
-            }
-            else {
-                samples_to_skip -= unpack_samples3 (wpc, buffer, samples_to_skip);
-                break;
-            }
-        }
-
-        free (buffer);
-
-        if (samples_to_skip)
-            return FALSE;
-    }
-
-    return TRUE;
-}
-
-
 #endif
 
 // This monster actually unpacks the WavPack bitstream(s) into the specified
@@ -797,7 +266,7 @@ int32_t unpack_samples3 (WavpackContext *wpc, int32_t *buffer, uint32_t sample_c
     WavpackStream3 *wps = (WavpackStream3 *) wpc->stream3;
     int shift = wps->wphdr.shift, flags = wps->wphdr.flags, min_weight = 0, m = wps->dc.m, tcount;
 #ifndef NO_SEEKING
-    int points_index = wps->sample_index / ((wpc->total_samples >> 8) + 1);
+    int points_index = wps->sample_index / (((uint32_t) wpc->total_samples >> 8) + 1);
 #endif
     int32_t min_value, max_value, min_shifted, max_shifted;
     int32_t correction [2], crc = wps->dc.crc;
@@ -820,7 +289,7 @@ int32_t unpack_samples3 (WavpackContext *wpc, int32_t *buffer, uint32_t sample_c
 #endif
 
     if (wps->sample_index + sample_count > wpc->total_samples)
-        sample_count = wpc->total_samples - wps->sample_index;
+        sample_count = (uint32_t) (wpc->total_samples - wps->sample_index);
 
     if (!sample_count)
         return 0;
@@ -1705,22 +1174,22 @@ int32_t unpack_samples3 (WavpackContext *wpc, int32_t *buffer, uint32_t sample_c
             wpc->crc_errors++;
 
         if (wpc->open_flags & OPEN_WRAPPER) {
-            unsigned char *temp = malloc (1024);
+            unsigned char *temp = (unsigned char *)malloc (1024);
             uint32_t bcount;
 
             if (bs_unused_bytes (&wps->wvbits)) {
-                wpc->wrapper_data = realloc (wpc->wrapper_data, wpc->wrapper_bytes + bs_unused_bytes (&wps->wvbits));
+                wpc->wrapper_data = (unsigned char *)realloc (wpc->wrapper_data, wpc->wrapper_bytes + bs_unused_bytes (&wps->wvbits));
                 memcpy (wpc->wrapper_data + wpc->wrapper_bytes, bs_unused_data (&wps->wvbits), bs_unused_bytes (&wps->wvbits));
                 wpc->wrapper_bytes += bs_unused_bytes (&wps->wvbits);
             }
 
             while (1) {
-                bcount = wpc->reader->read_bytes (wpc->wv_in, temp, sizeof (temp));
+                bcount = wpc->reader->read_bytes (wpc->wv_in, temp, 1024);
 
                 if (!bcount)
                     break;
 
-                wpc->wrapper_data = realloc (wpc->wrapper_data, wpc->wrapper_bytes + bcount);
+                wpc->wrapper_data = (unsigned char *)realloc (wpc->wrapper_data, wpc->wrapper_bytes + bcount);
                 memcpy (wpc->wrapper_data + wpc->wrapper_bytes, temp, bcount);
                 wpc->wrapper_bytes += bcount;
             }
@@ -1733,7 +1202,7 @@ int32_t unpack_samples3 (WavpackContext *wpc, int32_t *buffer, uint32_t sample_c
                 for (c = 0; c < 16 && wpc->wrapper_data [c] == 0xff; ++c);
 
                 if (c == 16) {
-                    memcpy (wpc->wrapper_data, wpc->wrapper_data + 16, wpc->wrapper_bytes - 16);
+                    memmove (wpc->wrapper_data, wpc->wrapper_data + 16, wpc->wrapper_bytes - 16);
                     wpc->wrapper_bytes -= 16;
                 }
                 else {
@@ -1753,12 +1222,6 @@ int32_t unpack_samples3 (WavpackContext *wpc, int32_t *buffer, uint32_t sample_c
     return i;
 }
 
-///////////////////////////// local table storage ////////////////////////////
-
-extern const uint32_t bitset [];
-extern const uint32_t bitmask [];
-extern const char nbits_table [];
-
 // This function initializes everything required to receive words with this
 // module and must be called BEFORE any other function in this module.
 
@@ -1775,18 +1238,6 @@ static void init_words3 (WavpackStream3 *wps)
         wps->w4.bitrate = (wps->wphdr.bits / 2) - 768;
 }
 
-// This macro counts the number of bits that are required to specify the
-// unsigned 32-bit value, counting from the LSB to the most significant bit
-// that is set. Return range is 0 - 32.
-
-#define count_bits(av) ( \
- (av) < (1 << 8) ? nbits_table [av] : \
-  ( \
-   (av) < (1L << 16) ? nbits_table [(av) >> 8] + 8 : \
-   ((av) < (1L << 24) ? nbits_table [(av) >> 16] + 16 : nbits_table [(av) >> 24] + 24) \
-  ) \
-)
-
 static int32_t FASTCALL get_word1 (WavpackStream3 *wps, int chan)
 {
     uint32_t tmp1, tmp2, avalue;
@@ -2033,7 +1484,7 @@ static int32_t FASTCALL get_word3 (WavpackStream3 *wps, int chan)
     }
 }
 
-static int FASTCALL _log2 (uint32_t avalue);
+static int FASTCALL wp3_log2 (uint32_t avalue);
 
 static int32_t FASTCALL get_word4 (WavpackStream3 *wps, int chan, int32_t *correction)
 {
@@ -2076,22 +1527,22 @@ static int32_t FASTCALL get_word4 (WavpackStream3 *wps, int chan, int32_t *corre
         int slow_log_0, slow_log_1, balance;
 
         if (wps->wphdr.flags & MONO_FLAG) {
-            wps->w4.bits_acc [0] += wps->w4.bitrate + _log2 (wps->w4.fast_level [0]) - _log2 (wps->w4.slow_level [0]) + (3 << 8);
+            wps->w4.bits_acc [0] += wps->w4.bitrate + wp3_log2 (wps->w4.fast_level [0]) - wp3_log2 (wps->w4.slow_level [0]) + (3 << 8);
 
             if (wps->w4.bits_acc [0] < 0)
                 wps->w4.bits_acc [0] = 0;
         }
         else {
-            slow_log_0 = _log2 (wps->w4.slow_level [0]);
-            slow_log_1 = _log2 (wps->w4.slow_level [1]);
+            slow_log_0 = wp3_log2 (wps->w4.slow_level [0]);
+            slow_log_1 = wp3_log2 (wps->w4.slow_level [1]);
 
             if (wps->wphdr.flags & JOINT_STEREO)
                 balance = (slow_log_1 - slow_log_0 + 257) >> 1;
             else
                 balance = (slow_log_1 - slow_log_0 + 1) >> 1;
 
-            wps->w4.bits_acc [0] += wps->w4.bitrate - balance + _log2 (wps->w4.fast_level [0]) - slow_log_0 + (3 << 8);
-            wps->w4.bits_acc [1] += wps->w4.bitrate + balance + _log2 (wps->w4.fast_level [1]) - slow_log_1 + (3 << 8);
+            wps->w4.bits_acc [0] += wps->w4.bitrate - balance + wp3_log2 (wps->w4.fast_level [0]) - slow_log_0 + (3 << 8);
+            wps->w4.bits_acc [1] += wps->w4.bitrate + balance + wp3_log2 (wps->w4.fast_level [1]) - slow_log_1 + (3 << 8);
 
             if (wps->w4.bits_acc [0] + wps->w4.bits_acc [1] < 0)
                 wps->w4.bits_acc [0] = wps->w4.bits_acc [1] = 0;
@@ -2171,7 +1622,7 @@ static int32_t FASTCALL get_word4 (WavpackStream3 *wps, int chan, int32_t *corre
 // fraction) from the supplied value. Using logarithms makes comparing
 // signal level values and calculating fractional bitrates much easier.
 
-static int FASTCALL _log2 (uint32_t avalue)
+static int FASTCALL wp3_log2 (uint32_t avalue)
 {
     int dbits;
 
@@ -2191,5 +1642,62 @@ static int FASTCALL _log2 (uint32_t avalue)
     }
 }
 
-#endif
+// Refill the buffer from the reader; if nothing is read, hand out a buffer full of 0xFF bytes.
+static void bs_read3 (Bitstream3 *bs)
+{
+    const uint32_t got = bs->reader->read_bytes (bs->id, bs->buf, bs->bufsiz);
+
+    bs->ptr = bs->buf;
+    bs->fpos += got;
+
+    if (got)
+        bs->end = bs->buf + got;
+    else {
+        memset (bs->buf, -1, bs->bufsiz);
+        bs->end = bs->buf + bs->bufsiz;
+    }
+}
+
+// Bind a reader and its handle to the bitstream and prime it for reading.
+// "bufsiz" must be preset by the caller and the stream positioned at the bit
+// data; a buffer is allocated only if none is attached yet. Returns nonzero
+// (mirrored in bs->error) when no buffer could be obtained.
+
+static int bs_open_read3 (Bitstream3 *bs, WavpackStreamReader64 *reader, void *id)
+{
+    bs->reader = reader;
+    bs->id = id;
+    bs->fpos = reader->get_pos (id);
+
+    if (!bs->buf)
+        bs->buf = (unsigned char *) malloc (bs->bufsiz);
+    bs->error = bs->buf ? 0 : 1;
+    bs->wrap = bs_read3;            // refill hook
+    bs->sr = bs->bc = 0;
+    bs->end = bs->buf + bs->bufsiz;
+    bs->ptr = bs->end - 1;          // start on the last byte of the (still unfilled) buffer
+    return bs->error;
+}
+
+// When bc < 8, step ptr forward one byte and add 8 to bc; the pointer that
+// results is returned as the start of the data not yet consumed.
+static unsigned char *bs_unused_data (Bitstream3 *bs)
+{
+    if (bs->bc < 8) {
+        ++bs->ptr;
+        bs->bc += 8;
+    }
+
+    return bs->ptr;
+}
+
+// Number of bytes between the (adjusted) read pointer and the buffer end.
+static uint32_t bs_unused_bytes (Bitstream3 *bs)
+{
+    const unsigned char *first_unused = bs_unused_data (bs);
+
+    return (uint32_t)(bs->end - first_unused);
+}
+
+#endif  // ENABLE_LEGACY
 
diff --git a/third_party/wavpack/src/unpack3.h b/third_party/wavpack/src/unpack3.h
index cf3ca0e..ae351f8 100644
--- a/third_party/wavpack/src/unpack3.h
+++ b/third_party/wavpack/src/unpack3.h
@@ -12,9 +12,9 @@
 // decoding old (versions 1, 2 & 3) WavPack files.
 
 typedef struct {
-    unsigned short FormatTag, NumChannels;
+    uint16_t FormatTag, NumChannels;
     uint32_t SampleRate, BytesPerSecond;
-    unsigned short BlockAlign, BitsPerSample;
+    uint16_t BlockAlign, BitsPerSample;
 } WaveHeader3;
 
 #define WaveHeader3Format "SSLLSS"
@@ -22,9 +22,9 @@ typedef struct {
 typedef struct {
     char ckID [4];
     int32_t ckSize;
-    short version;
-    short bits;                 // added for version 2.00
-    short flags, shift;         // added for version 3.00
+    int16_t version;
+    int16_t bits;                 // added for version 2.00
+    int16_t flags, shift;         // added for version 3.00
     int32_t total_samples, crc, crc2;
     char extension [4], extra_bc, extras [3];
 } WavpackHeader3;
@@ -62,8 +62,9 @@ typedef struct {
 typedef struct bs3 {
     void (*wrap)(struct bs3 *bs);
     unsigned char *buf, *end, *ptr;
-    uint32_t bufsiz, fpos, sr;
-    WavpackStreamReader *reader;
+    uint32_t bufsiz, sr;
+    int64_t fpos;
+    WavpackStreamReader64 *reader;
     int error, bc;
     void *id;
 } Bitstream3;
@@ -111,3 +112,8 @@ typedef struct {
         int bits_acc [2], bitrate;
     } w4;
 } WavpackStream3;
+
+#define SAVE(destin, item) { memcpy (destin, &item, sizeof (item)); destin = (char *) destin + sizeof (item); }  // copy item out to destin, then advance destin past it
+#define RESTORE(item, source) { memcpy (&item, source, sizeof (item)); source = (char *) source + sizeof (item); }  // copy item back in from source, then advance source past it
+
+void unpack_init3 (WavpackStream3 *wps);  // declared here so other translation units (e.g. unpack3_seek.c) can call it
diff --git a/third_party/wavpack/src/unpack3_open.c b/third_party/wavpack/src/unpack3_open.c
new file mode 100644
index 0000000..1572aaf
--- /dev/null
+++ b/third_party/wavpack/src/unpack3_open.c
@@ -0,0 +1,289 @@
+////////////////////////////////////////////////////////////////////////////
+//                           **** WAVPACK ****                            //
+//                  Hybrid Lossless Wavefile Compressor                   //
+//              Copyright (c) 1998 - 2013 Conifer Software.               //
+//                          All Rights Reserved.                          //
+//      Distributed under the BSD Software License (see license.txt)      //
+////////////////////////////////////////////////////////////////////////////
+
+// unpack3_open.c
+
+// This module provides an extension to the open_utils.c module for handling
+// WavPack files prior to version 4.0, not including "raw" files. As these
+// modes are all obsolete and are no longer written, this code will not be
+// fully documented other than the global functions. However, full documentation
+// is provided in the version 3.97 source code. Note that this module only
+// provides the functionality of opening the files and obtaining information
+// from them; the actual audio decoding is located in the unpack3.c module.
+
+#ifdef ENABLE_LEGACY
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "wavpack_local.h"
+#include "unpack3.h"
+
+#define ATTEMPT_ERROR_MUTING
+
+// Extension of WavpackOpenFileRead () for non-raw pre-4.0 files. The stream
+// must be positioned at the leading 'R' of the RIFF header. Parses the RIFF
+// chunks and the legacy WavPack header, fills in wpc->config and attaches a
+// WavpackStream3 to wpc. Returns wpc on success; on any failure stores a
+// message in "error" (if non-NULL) and returns WavpackCloseFile (wpc).
+
+WavpackContext *open_file3 (WavpackContext *wpc, char *error)
+{
+    RiffChunkHeader RiffChunkHeader;
+    ChunkHeader ChunkHeader;
+    WavpackHeader3 wphdr;
+    WavpackStream3 *wps;
+    WaveHeader3 wavhdr;
+
+    CLEAR (wavhdr);
+    CLEAR (wphdr);          // version 1/2 headers are shorter, so the tail is never read from the file
+    wpc->stream3 = wps = (WavpackStream3 *) malloc (sizeof (WavpackStream3));
+
+    if (!wps) {             // bail out before CLEAR () dereferences a failed allocation
+        if (error) strcpy (error, "can't allocate memory!");
+        return WavpackCloseFile (wpc);
+    }
+
+    CLEAR (*wps);
+
+    if (wpc->reader->read_bytes (wpc->wv_in, &RiffChunkHeader, sizeof (RiffChunkHeader)) !=
+        sizeof (RiffChunkHeader)) {
+            if (error) strcpy (error, "not a valid WavPack file!");
+            return WavpackCloseFile (wpc);
+    }
+
+    if (!strncmp (RiffChunkHeader.ckID, "RIFF", 4) && !strncmp (RiffChunkHeader.formType, "WAVE", 4)) {
+        if (wpc->open_flags & OPEN_WRAPPER) {
+            wpc->wrapper_data = (unsigned char *)malloc (wpc->wrapper_bytes = sizeof (RiffChunkHeader));  // NOTE(review): this and the realloc () results below are used unchecked
+            memcpy (wpc->wrapper_data, &RiffChunkHeader, sizeof (RiffChunkHeader));
+        }
+
+        // Walk the RIFF chunks until the "data" chunk is reached; the WavPack
+        // header is then read from directly after that chunk header.
+
+        while (1) {
+            if (wpc->reader->read_bytes (wpc->wv_in, &ChunkHeader, sizeof (ChunkHeader)) !=
+                sizeof (ChunkHeader)) {
+                    if (error) strcpy (error, "not a valid WavPack file!");
+                    return WavpackCloseFile (wpc);
+            }
+            else {
+                if (wpc->open_flags & OPEN_WRAPPER) {
+                    wpc->wrapper_data = (unsigned char *)realloc (wpc->wrapper_data, wpc->wrapper_bytes + sizeof (ChunkHeader));
+                    memcpy (wpc->wrapper_data + wpc->wrapper_bytes, &ChunkHeader, sizeof (ChunkHeader));
+                    wpc->wrapper_bytes += sizeof (ChunkHeader);
+                }
+
+                WavpackLittleEndianToNative (&ChunkHeader, ChunkHeaderFormat);
+
+                if (!strncmp (ChunkHeader.ckID, "fmt ", 4)) {
+                    if (ChunkHeader.ckSize < sizeof (wavhdr) ||
+                        wpc->reader->read_bytes (wpc->wv_in, &wavhdr, sizeof (wavhdr)) != sizeof (wavhdr)) {
+                            if (error) strcpy (error, "not a valid WavPack file!");
+                            return WavpackCloseFile (wpc);
+                    }
+                    else if (wpc->open_flags & OPEN_WRAPPER) {
+                        wpc->wrapper_data = (unsigned char *)realloc (wpc->wrapper_data, wpc->wrapper_bytes + sizeof (wavhdr));
+                        memcpy (wpc->wrapper_data + wpc->wrapper_bytes, &wavhdr, sizeof (wavhdr));
+                        wpc->wrapper_bytes += sizeof (wavhdr);
+                    }
+
+                    WavpackLittleEndianToNative (&wavhdr, WaveHeader3Format);
+
+                    if (ChunkHeader.ckSize > sizeof (wavhdr)) {
+                        uint32_t bytes_to_skip = (ChunkHeader.ckSize + 1 - sizeof (wavhdr)) & ~1L;  // rest of the chunk, rounded up to even
+
+                        if (bytes_to_skip > 1024 * 1024) {
+                            if (error) strcpy (error, "not a valid WavPack file!");
+                            return WavpackCloseFile (wpc);
+                        }
+
+                        if (wpc->open_flags & OPEN_WRAPPER) {
+                            wpc->wrapper_data = (unsigned char *)realloc (wpc->wrapper_data, wpc->wrapper_bytes + bytes_to_skip);
+                            wpc->reader->read_bytes (wpc->wv_in, wpc->wrapper_data + wpc->wrapper_bytes, bytes_to_skip);  // NOTE(review): short reads are not detected
+                            wpc->wrapper_bytes += bytes_to_skip;
+                        }
+                        else {
+                            unsigned char *temp = (unsigned char *)malloc (bytes_to_skip);
+                            wpc->reader->read_bytes (wpc->wv_in, temp, bytes_to_skip);
+                            free (temp);
+                        }
+                    }
+                }
+                else if (!strncmp (ChunkHeader.ckID, "data", 4))
+                    break;
+                else if ((ChunkHeader.ckSize + 1) & ~1L) {
+                    uint32_t bytes_to_skip = (ChunkHeader.ckSize + 1) & ~1L;    // unknown chunk: skip it, rounded up to even
+
+                    if (bytes_to_skip > 1024 * 1024) {
+                        if (error) strcpy (error, "not a valid WavPack file!");
+                        return WavpackCloseFile (wpc);
+                    }
+
+                    if (wpc->open_flags & OPEN_WRAPPER) {
+                        wpc->wrapper_data = (unsigned char *)realloc (wpc->wrapper_data, wpc->wrapper_bytes + bytes_to_skip);
+                        wpc->reader->read_bytes (wpc->wv_in, wpc->wrapper_data + wpc->wrapper_bytes, bytes_to_skip);
+                        wpc->wrapper_bytes += bytes_to_skip;
+                    }
+                    else {
+                        unsigned char *temp = (unsigned char *)malloc (bytes_to_skip);
+                        wpc->reader->read_bytes (wpc->wv_in, temp, bytes_to_skip);
+                        free (temp);
+                    }
+                }
+            }
+        }
+    }
+    else {
+        if (error) strcpy (error, "not a valid WavPack file!");
+        return WavpackCloseFile (wpc);
+    }
+
+    if (wavhdr.FormatTag != 1 || !wavhdr.NumChannels || wavhdr.NumChannels > 2 ||
+        !wavhdr.SampleRate || wavhdr.BitsPerSample < 16 || wavhdr.BitsPerSample > 24 ||
+        wavhdr.BlockAlign / wavhdr.NumChannels > 3 || wavhdr.BlockAlign % wavhdr.NumChannels ||
+        wavhdr.BlockAlign / wavhdr.NumChannels < (wavhdr.BitsPerSample + 7) / 8) {
+            if (error) strcpy (error, "not a valid WavPack file!");
+            return WavpackCloseFile (wpc);
+    }
+
+    wpc->total_samples = ChunkHeader.ckSize / wavhdr.NumChannels /
+        ((wavhdr.BitsPerSample > 16) ? 3 : 2);
+
+    if (wpc->reader->read_bytes (wpc->wv_in, &wphdr, 10) != 10) {   // ckID, ckSize and version
+        if (error) strcpy (error, "not a valid WavPack file!");
+        return WavpackCloseFile (wpc);
+    }
+
+    if (((char *) &wphdr) [8] == 2 && (wpc->reader->read_bytes (wpc->wv_in, ((char *) &wphdr) + 10, 2) != 2)) {
+        if (error) strcpy (error, "not a valid WavPack file!");
+        return WavpackCloseFile (wpc);
+    }
+    else if (((char *) &wphdr) [8] == 3 && (wpc->reader->read_bytes (wpc->wv_in, ((char *) &wphdr) + 10,
+        sizeof (wphdr) - 10) != sizeof (wphdr) - 10)) {
+            if (error) strcpy (error, "not a valid WavPack file!");
+            return WavpackCloseFile (wpc);
+    }
+
+    WavpackLittleEndianToNative (&wphdr, WavpackHeader3Format);
+
+    // make sure this is a version we know about
+
+    if (strncmp (wphdr.ckID, "wvpk", 4) || wphdr.version < 1 || wphdr.version > 3) {
+        if (error) strcpy (error, "not a valid WavPack file!");
+        return WavpackCloseFile (wpc);
+    }
+
+    // Because I ran out of flag bits in the WavPack header, an amazingly ugly
+    // kludge was forced upon me! This code takes care of preparing the flags
+    // field for internal use and checking for unknown formats we can't decode
+
+    if (wphdr.version == 3) {
+        if (wphdr.flags & EXTREME_DECORR) {
+            if ((wphdr.flags & NOT_STORED_FLAGS) ||
+                ((wphdr.bits) &&
+                (((wphdr.flags & NEW_HIGH_FLAG) &&
+                (wphdr.flags & (FAST_FLAG | HIGH_FLAG))) ||
+                (wphdr.flags & CROSS_DECORR)))) {
+                    if (error) strcpy (error, "not a valid WavPack file!");
+                    return WavpackCloseFile (wpc);
+            }
+
+            if (wphdr.flags & CANCEL_EXTREME)
+                wphdr.flags &= ~(EXTREME_DECORR | CANCEL_EXTREME);
+        }
+        else
+            wphdr.flags &= ~CROSS_DECORR;
+    }
+
+    // check to see if we should look for a "correction" file, and if so try
+    // to open it for reading, then set WVC_FLAG accordingly
+
+    if (wpc->wvc_in && wphdr.version == 3 && wphdr.bits && (wphdr.flags & NEW_HIGH_FLAG)) {
+        wpc->file2len = wpc->reader->get_length (wpc->wvc_in);
+        wphdr.flags |= WVC_FLAG;
+        wpc->wvc_flag = TRUE;
+    }
+    else
+        wphdr.flags &= ~WVC_FLAG;
+
+    // check WavPack version to handle special requirements of versions
+    // before 3.0 that had smaller headers
+
+    if (wphdr.version < 3) {
+        wphdr.total_samples = (int32_t) wpc->total_samples;
+        wphdr.flags = wavhdr.NumChannels == 1 ? MONO_FLAG : 0;
+        wphdr.shift = 16 - wavhdr.BitsPerSample;
+
+        if (wphdr.version == 1)
+            wphdr.bits = 0;
+    }
+
+    wpc->config.sample_rate = wavhdr.SampleRate;
+    wpc->config.num_channels = wavhdr.NumChannels;
+    wpc->config.channel_mask = 5 - wavhdr.NumChannels;     // 4 for mono, 3 for stereo
+
+    if (wphdr.flags & MONO_FLAG)
+        wpc->config.flags |= CONFIG_MONO_FLAG;
+
+    if (wphdr.flags & EXTREME_DECORR)
+        wpc->config.flags |= CONFIG_HIGH_FLAG;
+
+    if (wphdr.bits) {
+        if (wphdr.flags & NEW_HIGH_FLAG)
+            wpc->config.flags |= CONFIG_HYBRID_FLAG;
+        else
+            wpc->config.flags |= CONFIG_LOSSY_MODE;
+    }
+    else if (!(wphdr.flags & HIGH_FLAG))
+        wpc->config.flags |= CONFIG_FAST_FLAG;
+
+    wpc->config.bytes_per_sample = (wphdr.flags & BYTES_3) ? 3 : 2;
+    wpc->config.bits_per_sample = wavhdr.BitsPerSample;
+
+    memcpy (&wps->wphdr, &wphdr, sizeof (wphdr));
+    wps->wvbits.bufsiz = wps->wvcbits.bufsiz = 1024 * 1024;    // buffer size later used by the bitstream open
+    return wpc;
+}
+
+// Report the current sample index; yields (uint32_t) -1 without a legacy stream.
+
+uint32_t get_sample_index3 (WavpackContext *wpc)
+{
+    const WavpackStream3 *legacy = (const WavpackStream3 *) wpc->stream3;
+    if (!legacy) return (uint32_t) -1;
+    return legacy->sample_index;
+}
+
+int get_version3 (WavpackContext *wpc)  // version field of the legacy header
+{
+    const WavpackStream3 *legacy = (const WavpackStream3 *) wpc->stream3;
+    if (!legacy) return 0;              // no legacy stream attached to this context
+    return legacy->wphdr.version;
+}
+
+// Release the legacy stream attached to wpc (saved seek state, bitstream
+// buffers, the structure itself) and clear wpc->stream3. No-op if none is attached.
+void free_stream3 (WavpackContext *wpc)
+{
+    WavpackStream3 *wps = (WavpackStream3 *) wpc->stream3;
+
+    if (!wps)
+        return;
+#ifndef NO_SEEKING
+    free (wps->unpack_data);            // free (NULL) is a no-op, so no guard is needed
+#endif
+    if (wps->wphdr.flags & WVC_FLAG)    // correction buffer is only released when the flag is set
+        free (wps->wvcbits.buf);
+
+    free (wps->wvbits.buf);
+    free (wps);
+    wpc->stream3 = NULL;                // don't leave a dangling pointer for later calls
+}
+
+#endif  // ENABLE_LEGACY
diff --git a/third_party/wavpack/src/unpack3_seek.c b/third_party/wavpack/src/unpack3_seek.c
new file mode 100644
index 0000000..f1ed27f
--- /dev/null
+++ b/third_party/wavpack/src/unpack3_seek.c
@@ -0,0 +1,212 @@
+////////////////////////////////////////////////////////////////////////////
+//                           **** WAVPACK ****                            //
+//                  Hybrid Lossless Wavefile Compressor                   //
+//              Copyright (c) 1998 - 2013 Conifer Software.               //
+//                          All Rights Reserved.                          //
+//      Distributed under the BSD Software License (see license.txt)      //
+////////////////////////////////////////////////////////////////////////////
+
+// unpack3_seek.c
+
+// This module provides seeking support for WavPack files prior to version 4.0.
+
+#ifdef ENABLE_LEGACY
+#ifndef NO_SEEKING
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "wavpack_local.h"
+#include "unpack3.h"
+
+static void *unpack_restore (WavpackStream3 *wps, void *source, int keep_resources);
+static void bs_restore3 (Bitstream3 *bs);
+
+// Seek a pre-4.0 stream to "desired_index" (extension of WavpackSeekSample ()).
+// These files are not inherently seekable: the nearest usable saved index
+// point is restored and the remaining samples are decoded and discarded, so
+// a forward seek into not-yet-decoded territory can be slow. Returns TRUE on
+// success, FALSE if the index is out of range or the target is not reached.
+
+int seek_sample3 (WavpackContext *wpc, uint32_t desired_index)
+{
+    int points_index = desired_index / (((uint32_t) wpc->total_samples >> 8) + 1);
+    WavpackStream3 *wps = (WavpackStream3 *) wpc->stream3;
+
+    if (desired_index >= wpc->total_samples)
+        return FALSE;
+
+    // walk back to the closest saved index point at or before the target
+    while (points_index)
+        if (wps->index_points [points_index].saved &&
+            wps->index_points [points_index].sample_index <= desired_index)
+                break;
+        else
+            points_index--;
+
+    // restore only if that point is ahead of the current position or the target is behind us
+    if (wps->index_points [points_index].saved)
+        if (wps->index_points [points_index].sample_index > wps->sample_index ||
+            wps->sample_index > desired_index) {
+                wps->sample_index = wps->index_points [points_index].sample_index;
+                unpack_restore (wps, wps->unpack_data + points_index * wps->unpack_size, TRUE);
+        }
+
+    if (desired_index > wps->sample_index) {
+        int32_t *buffer = (int32_t *) malloc (1024 * (wps->wphdr.flags & MONO_FLAG ? 4 : 8));
+        uint32_t samples_to_skip = desired_index - wps->sample_index;
+
+        if (!buffer)                // cannot decode-and-discard without scratch space
+            return FALSE;
+
+        // full 1024-sample chunks first; a short chunk stops here with the count left nonzero
+        while (samples_to_skip > 1024 && unpack_samples3 (wpc, buffer, 1024) == 1024)
+            samples_to_skip -= 1024;
+
+        if (samples_to_skip <= 1024)
+            samples_to_skip -= unpack_samples3 (wpc, buffer, samples_to_skip);
+
+        free (buffer);
+
+        if (samples_to_skip)
+            return FALSE;
+    }
+
+    return TRUE;
+}
+
+// Restore the unpacking context previously stored at "source" and return
+// the pointer advanced past everything consumed. Afterwards decoding resumes
+// from the state that was captured when the snapshot was saved. With
+// "keep_resources" set, the current reader handles and buffers are kept and
+// the restored bitstream pointers are rebased onto the current buffers
+// (the saved ones may belong to a file that has since been closed and reopened).
+
+static void *unpack_restore (WavpackStream3 *wps, void *source, int keep_resources)
+{
+    int flags = wps->wphdr.flags, tcount;
+    struct decorr_pass *dpp;
+    FILE *temp_file;
+    unsigned char *temp_buf;
+
+    unpack_init3 (wps);                 // re-initialise before loading the snapshot
+    temp_file = wps->wvbits.id;         // remember the live handle and buffer
+    temp_buf = wps->wvbits.buf;
+    RESTORE (wps->wvbits, source);      // overwrites the whole Bitstream3, pointers included
+
+    if (keep_resources) {
+        wps->wvbits.id = temp_file;
+        wps->wvbits.ptr += temp_buf - wps->wvbits.buf;     // rebase saved pointers onto the live buffer
+        wps->wvbits.end += temp_buf - wps->wvbits.buf;
+        wps->wvbits.buf = temp_buf;
+    }
+
+    bs_restore3 (&wps->wvbits);         // reposition the reader and re-read the buffer tail
+
+    if (flags & WVC_FLAG) {             // same again for the second bitstream (wvcbits)
+        temp_file = wps->wvcbits.id;
+        temp_buf = wps->wvcbits.buf;
+        RESTORE (wps->wvcbits, source);
+
+        if (keep_resources) {
+            wps->wvcbits.id = temp_file;
+            wps->wvcbits.ptr += temp_buf - wps->wvcbits.buf;
+            wps->wvcbits.end += temp_buf - wps->wvcbits.buf;
+            wps->wvcbits.buf = temp_buf;
+        }
+
+        bs_restore3 (&wps->wvcbits);
+    }
+
+    // NOTE(review): the order of the RESTORE calls below presumably mirrors the save routine in unpack3.c - keep them in sync
+    if (wps->wphdr.version == 3) {
+        if (wps->wphdr.bits) {
+            RESTORE (wps->w4, source);
+        }
+        else {
+            RESTORE (wps->w1, source);
+        }
+
+        RESTORE (wps->w3, source);
+        RESTORE (wps->dc.crc, source);
+    }
+    else
+        RESTORE (wps->w2, source);
+
+    if (wps->wphdr.bits) {
+        RESTORE (wps->dc.error, source);
+    }
+    else {
+        RESTORE (wps->dc.sum_level, source);
+        RESTORE (wps->dc.left_level, source);
+        RESTORE (wps->dc.right_level, source);
+        RESTORE (wps->dc.diff_level, source);
+    }
+
+    if (flags & OVER_20) {
+        RESTORE (wps->dc.last_extra_bits, source);
+        RESTORE (wps->dc.extra_bits_count, source);
+    }
+
+    if (!(flags & EXTREME_DECORR)) {
+        RESTORE (wps->dc.sample, source);
+        RESTORE (wps->dc.weight, source);
+    }
+
+    if (flags & (HIGH_FLAG | NEW_HIGH_FLAG))
+        for (tcount = wps->num_terms, dpp = wps->decorr_passes; tcount--; dpp++) {
+            if (dpp->term > 0) {
+                int count = dpp->term;          // one history sample per unit of "term"
+                int index = wps->dc.m;          // start at the current history position
+
+                RESTORE (dpp->weight_A, source);
+
+                while (count--) {
+                    RESTORE (dpp->samples_A [index], source);
+                    index = (index + 1) & (MAX_TERM - 1);   // wraps around (assumes MAX_TERM is a power of two)
+                }
+
+                if (!(flags & MONO_FLAG)) {     // second channel only for stereo
+                    count = dpp->term;
+                    index = wps->dc.m;
+
+                    RESTORE (dpp->weight_B, source);
+
+                    while (count--) {
+                        RESTORE (dpp->samples_B [index], source);
+                        index = (index + 1) & (MAX_TERM - 1);
+                    }
+                }
+            }
+            else {                              // non-positive term: both weights and one sample per channel
+                RESTORE (dpp->weight_A, source);
+                RESTORE (dpp->weight_B, source);
+                RESTORE (dpp->samples_A [0], source);
+                RESTORE (dpp->samples_B [0], source);
+            }
+        }
+
+    return source;
+}
+
+// Called once unpack_restore() has put a Bitstream3 back into a saved state:
+// moves the reader back by the number of bytes that follow bs->ptr in the
+// buffer and re-reads them, pulling bs->end in if the read comes up short.
+// This function is NOT supported for overlapped operation.
+
+static void bs_restore3 (Bitstream3 *bs)
+{
+    unsigned char *refill_at = bs->ptr + 1;
+    uint32_t wanted = (uint32_t)(bs->end - refill_at), got;
+
+    bs->reader->set_pos_abs (bs->id, bs->fpos - wanted);
+
+    if (!wanted)
+        return;
+
+    got = bs->reader->read_bytes (bs->id, refill_at, wanted);
+    if (got != wanted)
+        bs->end = refill_at + got;
+}
+
+#endif      // NO_SEEKING
+#endif      // ENABLE_LEGACY
diff --git a/third_party/wavpack/src/unpack_armv7.S b/third_party/wavpack/src/unpack_armv7.S
new file mode 100644
index 0000000..f423de3
--- /dev/null
+++ b/third_party/wavpack/src/unpack_armv7.S
@@ -0,0 +1,887 @@
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@@                           **** WAVPACK ****                            @@
+@@                  Hybrid Lossless Wavefile Compressor                   @@
+@@              Copyright (c) 1998 - 2015 Conifer Software.               @@
+@@                          All Rights Reserved.                          @@
+@@      Distributed under the BSD Software License (see license.txt)      @@
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+        .text
+        .align
+        .global         unpack_decorr_stereo_pass_cont_armv7
+        .global         unpack_decorr_mono_pass_cont_armv7
+
+/* This is an assembly optimized version of the following WavPack function:
+ *
+ * void decorr_stereo_pass_cont (struct decorr_pass *dpp,
+ *                               int32_t *buffer,
+ *                               int32_t sample_count,
+ *                               int32_t long_math);
+ *
+ * It performs a single pass of stereo decorrelation on the provided buffer.
+ * Note that this version of the function requires that up to 8 previous stereo
+ * samples are visible and correct. In other words, it ignores the "samples_*"
+ * fields in the decorr_pass structure and gets the history data directly
+ * from the buffer. It does, however, return the appropriate history samples
+ * to the decorr_pass structure before returning.
+ *
+ * This should work on all ARM architectures. This version of the code
+ * checks the magnitude of the decorrelation sample with a pair of shifts
+ * to avoid possible overflow (and therefore ignores the "long_math" arg).
+ * Previously I used the SSAT instruction for this, but then discovered that
+ * SSAT is not universally available (although on the armv7 I'm testing on
+ * it is slightly faster than the shifts).
+ *
+ * A mono version follows below. 
+ */
+
+/*
+ * on entry:
+ *
+ * r0 = struct decorr_pass *dpp
+ * r1 = int32_t *buffer
+ * r2 = int32_t sample_count
+ * r3 = int32_t long_math
+ */
+
+unpack_decorr_stereo_pass_cont_armv7:
+
+        stmfd   sp!, {r4 - r8, r10, r11, lr}    @ saved set is restored (lr into pc) at common_exit
+
+        mov     r5, r0                  @ r5 = dpp
+        mov     r11, #512               @ r11 = 512 for rounding
+        ldr     r6, [r0, #4]            @ r6 = dpp->delta
+        ldr     r4, [r0, #8]            @ r4 = dpp->weight_A
+        ldr     r0, [r0, #12]           @ r0 = dpp->weight_B
+        cmp     r2, #0                  @ exit if no samples to process
+        beq     common_exit
+
+        add     r7, r1, r2, asl #3      @ r7 = buffer ending position (8 bytes per stereo sample)
+        ldr     r2, [r5, #0]            @ r2 = dpp->term
+        cmp     r2, #0
+        bmi     minus_term              @ negative terms (-1, -2, -3) are dispatched separately
+
+        ldr     lr, [r1, #-16]          @ load 2 sample history from buffer
+        ldr     r10, [r1, #-12]         @  for terms 2, 17, and 18
+        ldr     r8, [r1, #-8]           @ r8 = previous left (lr above = second previous left)
+        ldr     r3, [r1, #-4]           @ r3 = previous right (r10 above = second previous right)
+        cmp     r2, #17
+        beq     term_17_loop
+        cmp     r2, #18
+        beq     term_18_loop
+        cmp     r2, #2
+        beq     term_2_loop
+        b       term_default_loop       @ else handle default (1-8, except 2)
+
+minus_term:
+        mov     r10, #1024              @ r10 = -1024 for weight clipping
+        rsb     r10, r10, #0            @  (only used for negative terms)
+        cmn     r2, #1                  @ cmn adds its operands: Z set when term == -1
+        beq     term_minus_1
+        cmn     r2, #2
+        beq     term_minus_2
+        cmn     r2, #3
+        beq     term_minus_3
+        b       common_exit             @ any other negative term: store weights back unchanged, return
+
+/*
+ ******************************************************************************
+ * Loop to handle term = 17 condition
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 =
+ * r2 = current sample          r10 = second previous right sample
+ * r3 = previous right sample   r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = current decorrelation value
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = second previous left sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+term_17_loop:
+        rsb     ip, lr, r8, asl #1      @ decorr value = (2 * prev) - 2nd prev
+        mov     lr, r8                  @ previous becomes 2nd previous
+        ldr     r2, [r1], #4            @ get sample & update pointer
+        mov     r8, ip, lsl #11         @ check magnitude by shifting left then right
+        cmp     ip, r8, asr #11         @  and comparing, branch to 64-bit math if different
+        bne     S117
+        cmp     ip, #0                  @ Z set if decorr value is zero (sample is then unchanged)
+        mla     r8, ip, r4, r11         @ mult decorr value by weight, round,
+        add     r8, r2, r8, asr #10     @  shift, and add to new sample
+        b       S118
+
+S117:   mov     r8, #0                  @ use 64-bit multiply to avoid overflow
+        smlal   r11, r8, r4, ip         @ r8:r11 = 512 + (weight_A * decorr value)
+        add     r8, r2, r8, lsl #22     @ sample + (64-bit result >> 10): high word part
+        add     r8, r8, r11, lsr #10    @  ... plus low word part
+        mov     r11, #512               @ restore rounding constant overwritten by smlal
+
+S118:   strne   r8, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0                  @ skip weight update if decorr value or sample is zero
+        beq     S325
+        teq     ip, r2                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs match:  weight_A += delta
+
+S325:   rsb     ip, r10, r3, asl #1     @ do same thing for right channel
+        mov     r10, r3                 @ previous right becomes 2nd previous right
+        ldr     r2, [r1], #4            @ get right sample & update pointer
+        mov     r3, ip, lsl #11         @ check magnitude by shifting left then right
+        cmp     ip, r3, asr #11         @  and comparing, branch to 64-bit math if different
+        bne     S119
+        cmp     ip, #0                  @ Z set if decorr value is zero
+        mla     r3, ip, r0, r11         @ decorr value * weight_B + 512
+        add     r3, r2, r3, asr #10     @ shift and add to new sample
+        b       S120
+
+S119:   mov     r3, #0                  @ 64-bit path, as at S117 but with weight_B
+        smlal   r11, r3, r0, ip
+        add     r3, r2, r3, lsl #22
+        add     r3, r3, r11, lsr #10
+        mov     r11, #512               @ restore rounding constant
+
+S120:   strne   r3, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0
+        beq     S329
+        teq     ip, r2                  @ update weight_B based on signs
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+
+S329:   cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_17_loop
+        b       store_1718              @ common exit for terms 17 & 18
+
+/*
+ ******************************************************************************
+ * Loop to handle term = 18 condition
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 =
+ * r2 = current sample          r10 = second previous right sample
+ * r3 = previous right sample   r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = decorrelation value
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = second previous left sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+term_18_loop:
+        sub     ip, r8, lr              @ decorr value =
+        mov     lr, r8                  @  ((3 * prev) - 2nd prev) >> 1
+        add     ip, r8, ip, asr #1      @  computed as prev + ((prev - 2nd prev) >> 1)
+        ldr     r2, [r1], #4            @ get sample & update pointer
+        mov     r8, ip, lsl #11         @ check magnitude by shifting left then right
+        cmp     ip, r8, asr #11         @  and comparing, branch to 64-bit math if different
+        bne     S121
+        cmp     ip, #0                  @ Z set if decorr value is zero (sample is then unchanged)
+        mla     r8, ip, r4, r11         @ mult decorr value by weight, round,
+        add     r8, r2, r8, asr #10     @  shift, and add to new sample
+        b       S122
+
+S121:   mov     r8, #0                  @ use 64-bit multiply to avoid overflow
+        smlal   r11, r8, r4, ip         @ r8:r11 = 512 + (weight_A * decorr value)
+        add     r8, r2, r8, lsl #22     @ sample + (64-bit result >> 10): high word part
+        add     r8, r8, r11, lsr #10    @  ... plus low word part
+        mov     r11, #512               @ restore rounding constant overwritten by smlal
+
+S122:   strne   r8, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0                  @ skip weight update if decorr value or sample is zero
+        beq     S337
+        teq     ip, r2                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs match:  weight_A += delta
+
+S337:   sub     ip, r3, r10             @ do same thing for right channel
+        mov     r10, r3                 @ previous right becomes 2nd previous right
+        add     ip, r3, ip, asr #1
+        ldr     r2, [r1], #4            @ get right sample & update pointer
+        mov     r3, ip, lsl #11         @ check magnitude by shifting left then right
+        cmp     ip, r3, asr #11         @  and comparing, branch to 64-bit math if different
+        bne     S123
+        cmp     ip, #0                  @ Z set if decorr value is zero
+        mla     r3, ip, r0, r11         @ decorr value * weight_B + 512
+        add     r3, r2, r3, asr #10     @ shift and add to new sample
+        b       S124
+
+S123:   mov     r3, #0                  @ 64-bit path, as at S121 but with weight_B
+        smlal   r11, r3, r0, ip
+        add     r3, r2, r3, lsl #22
+        add     r3, r3, r11, lsr #10
+        mov     r11, #512               @ restore rounding constant
+
+S124:   strne   r3, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0
+        beq     S341
+        teq     ip, r2                  @ update weight_B based on signs
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+
+S341:   cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_18_loop
+
+/* common exit for terms 17 & 18 */
+
+store_1718:
+        str     r3, [r5, #48]           @ store sample history into struct (samples_B [0] = prev right)
+        str     r8, [r5, #16]           @ samples_A [0] = previous left
+        str     r10, [r5, #52]          @ samples_B [1] = second previous right
+        str     lr, [r5, #20]           @ samples_A [1] = second previous left
+        b       common_exit             @ and return
+
+/*
+ ******************************************************************************
+ * Loop to handle term = 2 condition
+ * (note that this case can be handled by the default term handler (1-8), but
+ * this special case is faster because it doesn't have to read memory twice)
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 =
+ * r2 = current sample          r10 = second previous right sample
+ * r3 = previous right sample   r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = decorrelation value
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = second previous left sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+term_2_loop:
+        mov     ip, lr                  @ get decorrelation value (2nd previous left sample)
+        mov     lr, r8                  @ previous becomes 2nd previous
+        ldr     r2, [r1], #4            @ get sample & update pointer
+        mov     r8, ip, lsl #11         @ check magnitude by shifting left then right
+        cmp     ip, r8, asr #11         @  and comparing, branch to 64-bit math if different
+        bne     S125
+        cmp     ip, #0                  @ Z set if decorr value is zero (sample is then unchanged)
+        mla     r8, ip, r4, r11         @ mult decorr value by weight, round,
+        add     r8, r2, r8, asr #10     @  shift, and add to new sample
+        b       S126
+
+S125:   mov     r8, #0                  @ use 64-bit multiply to avoid overflow
+        smlal   r11, r8, r4, ip         @ r8:r11 = 512 + (weight_A * decorr value)
+        add     r8, r2, r8, lsl #22     @ sample + (64-bit result >> 10): high word part
+        add     r8, r8, r11, lsr #10    @  ... plus low word part
+        mov     r11, #512               @ restore rounding constant overwritten by smlal
+
+S126:   strne   r8, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0                  @ skip weight update if decorr value or sample is zero
+        beq     S225
+        teq     ip, r2                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs match:  weight_A += delta
+
+S225:   mov     ip, r10                 @ do same thing for right channel
+        mov     r10, r3                 @ previous right becomes 2nd previous right
+        ldr     r2, [r1], #4            @ get right sample & update pointer
+        mov     r3, ip, lsl #11         @ check magnitude by shifting left then right
+        cmp     ip, r3, asr #11         @  and comparing, branch to 64-bit math if different
+        bne     S127
+        cmp     ip, #0                  @ Z set if decorr value is zero
+        mla     r3, ip, r0, r11         @ decorr value * weight_B + 512
+        add     r3, r2, r3, asr #10     @ shift and add to new sample
+        b       S128
+
+S127:   mov     r3, #0                  @ 64-bit path, as at S125 but with weight_B
+        smlal   r11, r3, r0, ip
+        add     r3, r2, r3, lsl #22
+        add     r3, r3, r11, lsr #10
+        mov     r11, #512               @ restore rounding constant
+
+S128:   strne   r3, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0
+        beq     S229
+        teq     ip, r2                  @ update weight_B based on signs
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+
+S229:   cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_2_loop
+        b       default_term_exit       @ this exit updates all dpp->samples
+
+/*
+ ******************************************************************************
+ * Loop to handle default term condition
+ *
+ * r0 = dpp->weight_B           r8 = result accumulator
+ * r1 = bptr                    r9 =
+ * r2 = dpp->term               r10 =
+ * r3 = decorrelation value     r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = current sample
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr =
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+term_default_loop:
+        ldr     ip, [r1]                @ get original sample
+        ldr     r3, [r1, -r2, asl #3]   @ get decorrelation value based on term
+        mov     r8, r3, lsl #11         @ check magnitude by shifting left then right
+        cmp     r3, r8, asr #11         @  and comparing, branch to 64-bit math if different
+        bne     S135
+        cmp     r3, #0                  @ Z set if decorr value is zero
+        mla     r8, r3, r4, r11         @ mult decorr value by weight, round,
+        add     r8, ip, r8, asr #10     @  shift and add to new sample
+        b       S136
+
+S135:   mov     r8, #0                  @ use 64-bit multiply to avoid overflow
+        smlal   r11, r8, r4, r3         @ r8:r11 = 512 + (weight_A * decorr value)
+        add     r8, ip, r8, lsl #22     @ sample + (64-bit result >> 10): high word part
+        add     r8, r8, r11, lsr #10    @  ... plus low word part
+        mov     r11, #512               @ restore rounding constant overwritten by smlal
+
+S136:   str     r8, [r1], #4            @ store update sample
+        cmpne   ip, #0                  @ skip weight update if decorr value or sample is zero
+        beq     S350
+        teq     ip, r3                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs match:  weight_A += delta
+
+S350:   ldr     ip, [r1]                @ do the same thing for right channel
+        ldr     r3, [r1, -r2, asl #3]   @ decorr value = right sample "term" stereo samples back
+        mov     r8, r3, lsl #11         @ check magnitude by shifting left then right
+        cmp     r3, r8, asr #11         @  and comparing, branch to 64-bit math if different
+        bne     S137
+        cmp     r3, #0                  @ Z set if decorr value is zero
+        mla     r8, r3, r0, r11         @ decorr value * weight_B + 512
+        add     r8, ip, r8, asr #10     @ shift and add to new sample
+        b       S138
+
+S137:   mov     r8, #0                  @ 64-bit path, as at S135 but with weight_B
+        smlal   r11, r8, r0, r3
+        add     r8, ip, r8, lsl #22
+        add     r8, r8, r11, lsr #10
+        mov     r11, #512               @ restore rounding constant
+
+S138:   str     r8, [r1], #4            @ store updated right sample
+        cmpne   ip, #0
+        beq     S354
+        teq     ip, r3                  @ update weight_B based on signs
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+
+S354:   cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_default_loop
+
+/*
+ * This exit is used by terms 1-8 to store the previous "term" samples (up to 8)
+ * into the decorr pass structure history
+ */
+
+default_term_exit:
+        ldr     r2, [r5, #0]            @ r2 = dpp->term
+
+S358:   sub     r2, r2, #1              @ r2 = history index, counting down from term - 1
+        sub     r1, r1, #8              @ step back one stereo sample in the buffer
+        ldr     r3, [r1, #4]            @ get right sample and store in dpp->samples_B [r2]
+        add     r6, r5, #48
+        str     r3, [r6, r2, asl #2]
+        ldr     r3, [r1, #0]            @ get left sample and store in dpp->samples_A [r2]
+        add     r6, r5, #16
+        str     r3, [r6, r2, asl #2]
+        cmp     r2, #0                  @ repeat until index 0 has been stored
+        bne     S358
+        b       common_exit
+
+/*
+ ******************************************************************************
+ * Loop to handle term = -1 condition
+ *
+ * r0 = dpp->weight_B           r8 =
+ * r1 = bptr                    r9 =
+ * r2 = intermediate result     r10 = -1024 (for clipping)
+ * r3 = previous right sample   r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = current sample
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = updated left sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+term_minus_1:
+        ldr     r3, [r1, #-4]           @ r3 = previous right sample
+
+term_minus_1_loop:
+        ldr     ip, [r1]                @ for left channel the decorrelation value
+                                        @  is the previous right sample (in r3)
+        mov     lr, r3, lsl #11         @ check magnitude by shifting left then right
+        cmp     r3, lr, asr #11         @  and comparing, branch to 64-bit math if different
+        bne     S142
+        cmp     r3, #0                  @ Z set if decorr value is zero
+        mla     r2, r3, r4, r11         @ decorr value * weight_A + 512
+        add     lr, ip, r2, asr #10     @ lr = updated left sample
+        b       S143
+
+S142:   mov     lr, #0                  @ use 64-bit multiply to avoid overflow
+        smlal   r11, lr, r4, r3         @ lr:r11 = 512 + (weight_A * decorr value)
+        add     lr, ip, lr, lsl #22     @ sample + (64-bit result >> 10): high word part
+        add     lr, lr, r11, lsr #10    @  ... plus low word part
+        mov     r11, #512               @ restore rounding constant overwritten by smlal
+
+S143:   str     lr, [r1], #8            @ store left sample, advance past the stereo pair
+        cmpne   ip, #0                  @ skip weight update if decorr value or sample is zero
+        beq     S361
+        teq     ip, r3                  @ update weight based on signs
+        submi   r4, r4, r6
+        addpl   r4, r4, r6
+        cmp     r4, #1024               @ then clip weight to +/-1024
+        movgt   r4, #1024
+        cmp     r4, r10
+        movlt   r4, r10
+
+S361:   ldr     r2, [r1, #-4]           @ for right channel the decorrelation value
+                                        @  is the just updated left sample (in lr)
+        mov     r3, lr, lsl #11         @ check magnitude by shifting left then right
+        cmp     lr, r3, asr #11         @  and comparing, branch to 64-bit math if different
+        bne     S144
+        cmp     lr, #0                  @ Z set if decorr value is zero
+        mla     r3, lr, r0, r11         @ decorr value * weight_B + 512
+        add     r3, r2, r3, asr #10     @ r3 = updated right sample
+        b       S145
+
+S144:   mov     r3, #0                  @ 64-bit path, as at S142 but with weight_B
+        smlal   r11, r3, r0, lr
+        add     r3, r2, r3, lsl #22
+        add     r3, r3, r11, lsr #10
+        mov     r11, #512               @ restore rounding constant
+
+S145:   strne   r3, [r1, #-4]           @ if change possible, store right sample back
+        cmpne   r2, #0
+        beq     S369
+        teq     r2, lr                  @ update weight_B based on signs
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+        cmp     r0, #1024               @ then clip weight to +/-1024
+        movgt   r0, #1024
+        cmp     r0, r10
+        movlt   r0, r10
+
+S369:   cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_minus_1_loop
+
+        str     r3, [r5, #16]           @ else store right sample and exit
+        b       common_exit
+
+/*
+ ******************************************************************************
+ * Loop to handle term = -2 condition
+ * (note that the channels are processed in the reverse order here)
+ *
+ * r0 = dpp->weight_B           r8 =
+ * r1 = bptr                    r9 =
+ * r2 = intermediate result     r10 = -1024 (for clipping)
+ * r3 = previous left sample    r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = current sample
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = updated right sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+term_minus_2:
+        ldr     r3, [r1, #-8]           @ r3 = previous left sample
+
+term_minus_2_loop:
+        ldr     ip, [r1, #4]            @ for right channel the decorrelation value
+                                        @  is the previous left sample (in r3)
+        mov     lr, r3, lsl #11         @ check magnitude by shifting left then right
+        cmp     r3, lr, asr #11         @  and comparing, branch to 64-bit math if different
+        bne     S146
+        cmp     r3, #0                  @ Z set if decorr value is zero
+        mla     r2, r3, r0, r11         @ decorr value * weight_B + 512
+        add     lr, ip, r2, asr #10     @ lr = updated right sample
+        b       S147
+
+S146:   mov     lr, #0                  @ use 64-bit multiply to avoid overflow
+        smlal   r11, lr, r0, r3         @ lr:r11 = 512 + (weight_B * decorr value)
+        add     lr, ip, lr, lsl #22     @ sample + (64-bit result >> 10): high word part
+        add     lr, lr, r11, lsr #10    @  ... plus low word part
+        mov     r11, #512               @ restore rounding constant overwritten by smlal
+
+S147:   strne   lr, [r1, #4]            @ if change possible, store right sample back
+        cmpne   ip, #0                  @ skip weight update if decorr value or sample is zero
+        beq     S380
+        teq     ip, r3                  @ update weight based on signs
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+        cmp     r0, #1024               @ then clip weight to +/-1024
+        movgt   r0, #1024
+        cmp     r0, r10
+        movlt   r0, r10
+
+S380:   ldr     r2, [r1, #0]            @ for left channel the decorrelation value
+                                        @  is the just updated right sample (in lr)
+        mov     r3, lr, lsl #11         @ check magnitude by shifting left then right
+        cmp     lr, r3, asr #11         @  and comparing, branch to 64-bit math if different
+        bne     S148
+        cmp     lr, #0                  @ Z set if decorr value is zero
+        mla     r3, lr, r4, r11         @ decorr value * weight_A + 512
+        add     r3, r2, r3, asr #10     @ r3 = updated left sample
+        b       S149
+
+S148:   mov     r3, #0                  @ 64-bit path, as at S146 but with weight_A
+        smlal   r11, r3, r4, lr
+        add     r3, r2, r3, lsl #22
+        add     r3, r3, r11, lsr #10
+        mov     r11, #512               @ restore rounding constant
+
+S149:   str     r3, [r1], #8            @ store left sample, advance past the stereo pair
+        cmpne   r2, #0
+        beq     S388
+        teq     r2, lr                  @ update weight_A based on signs
+        submi   r4, r4, r6
+        addpl   r4, r4, r6
+        cmp     r4, #1024               @ then clip weight to +/-1024
+        movgt   r4, #1024
+        cmp     r4, r10
+        movlt   r4, r10
+
+S388:   cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_minus_2_loop
+
+        str     r3, [r5, #48]           @ else store left channel and exit
+        b       common_exit
+
+/*
+ ******************************************************************************
+ * Loop to handle term = -3 condition
+ *
+ * r0 = dpp->weight_B           r8 = previous left sample
+ * r1 = bptr                    r9 =
+ * r2 = current left sample     r10 = -1024 (for clipping)
+ * r3 = previous right sample   r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = intermediate result
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr =
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+term_minus_3:
+        ldr     r3, [r1, #-4]           @ load previous samples
+        ldr     r8, [r1, #-8]           @  (r3 = previous right, r8 = previous left)
+
+term_minus_3_loop:
+        ldr     ip, [r1]                @ left sample; its decorr value is previous right (r3)
+        mov     r2, r3, lsl #11         @ check magnitude by shifting left then right
+        cmp     r3, r2, asr #11         @  and comparing, branch to 64-bit math if different
+        bne     S160
+        cmp     r3, #0                  @ Z set if decorr value is zero
+        mla     r2, r3, r4, r11         @ decorr value * weight_A + 512
+        add     r2, ip, r2, asr #10     @ r2 = updated left sample
+        b       S161
+
+S160:   mov     r2, #0                  @ use 64-bit multiply to avoid overflow
+        smlal   r11, r2, r4, r3         @ r2:r11 = 512 + (weight_A * decorr value)
+        add     r2, ip, r2, lsl #22     @ sample + (64-bit result >> 10): high word part
+        add     r2, r2, r11, lsr #10    @  ... plus low word part
+        mov     r11, #512               @ restore rounding constant overwritten by smlal
+
+S161:   str     r2, [r1], #4            @ store updated left sample
+        cmpne   ip, #0                  @ skip weight update if decorr value or sample is zero
+        beq     S399
+        teq     ip, r3                  @ update weight based on signs
+        submi   r4, r4, r6
+        addpl   r4, r4, r6
+        cmp     r4, #1024               @ then clip weight to +/-1024
+        movgt   r4, #1024
+        cmp     r4, r10
+        movlt   r4, r10
+
+S399:   mov     ip, r8                  @ ip = previous left we use now
+        mov     r8, r2                  @ r8 = current left we use next time
+        ldr     r2, [r1], #4            @ get right sample & update pointer
+        mov     r3, ip, lsl #11         @ check magnitude by shifting left then right
+        cmp     ip, r3, asr #11         @  and comparing, branch to 64-bit math if different
+        bne     S162
+        cmp     ip, #0                  @ Z set if decorr value is zero
+        mla     r3, ip, r0, r11         @ decorr value * weight_B + 512
+        add     r3, r2, r3, asr #10     @ r3 = updated right sample
+        b       S163
+
+S162:   mov     r3, #0                  @ 64-bit path, as at S160 but with weight_B
+        smlal   r11, r3, r0, ip
+        add     r3, r2, r3, lsl #22
+        add     r3, r3, r11, lsr #10
+        mov     r11, #512               @ restore rounding constant
+
+S163:   strne   r3, [r1, #-4]           @ if change possible, store right sample back
+        cmpne   r2, #0
+        beq     S407
+        teq     ip, r2                  @ update weight_B based on signs
+        submi   r0, r0, r6
+        addpl   r0, r0, r6
+        cmp     r0, #1024               @ then clip weight to +/-1024
+        movgt   r0, #1024
+        cmp     r0, r10
+        movlt   r0, r10
+
+S407:   cmp     r7, r1                  @ loop back if more samples to do
+        bhi     term_minus_3_loop
+
+        str     r3, [r5, #16]           @ else store previous samples & exit
+        str     r8, [r5, #48]           @  (last right -> samples_A [0], last left -> samples_B [0])
+
+/*
+ * Before finally exiting we must store weights back for next time
+ */
+
+common_exit:
+        str     r4, [r5, #8]            @ dpp->weight_A = r4
+        str     r0, [r5, #12]           @ dpp->weight_B = r0
+        ldmfd   sp!, {r4 - r8, r10, r11, pc}    @ restore registers; saved lr goes to pc (return)
+
+
+
+/* This is a mono version of the function above. It does not handle negative terms.
+ *
+ * void decorr_mono_pass_cont (struct decorr_pass *dpp,
+ *                             int32_t *buffer,
+ *                             int32_t sample_count,
+ *                             int32_t long_math);
+ * on entry:
+ *
+ * r0 = struct decorr_pass *dpp
+ * r1 = int32_t *buffer
+ * r2 = int32_t sample_count
+ * r3 = int32_t long_math
+ */
+
+unpack_decorr_mono_pass_cont_armv7:
+
+        stmfd   sp!, {r4 - r8, r11, lr}         @ saved set is restored (lr into pc) at mono_common_exit
+
+        mov     r5, r0                  @ r5 = dpp
+        mov     r11, #512               @ r11 = 512 for rounding
+        ldr     r6, [r0, #4]            @ r6 = dpp->delta
+        ldr     r4, [r0, #8]            @ r4 = dpp->weight_A
+        cmp     r2, #0                  @ exit if no samples to process
+        beq     mono_common_exit
+
+        add     r7, r1, r2, asl #2      @ r7 = buffer ending position (4 bytes per mono sample)
+        ldr     r2, [r5, #0]            @ r2 = dpp->term
+
+        ldr     lr, [r1, #-8]           @ load 2 sample history from buffer
+        ldr     r8, [r1, #-4]           @  (lr = second previous sample, r8 = previous sample)
+        cmp     r2, #17
+        beq     mono_term_17_loop
+        cmp     r2, #18
+        beq     mono_term_18_loop
+        cmp     r2, #2
+        beq     mono_term_2_loop
+        b       mono_term_default_loop  @ else handle default (1-8, except 2)
+
+/*
+ ******************************************************************************
+ * Loop to handle term = 17 condition
+ *
+ * r0 =                         r8 = previous sample
+ * r1 = bptr                    r9 =
+ * r2 = current sample          r10 =
+ * r3 =                         r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = current decorrelation value
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = second previous sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+mono_term_17_loop:
+        rsb     ip, lr, r8, asl #1      @ decorr value = (2 * prev) - 2nd prev
+        mov     lr, r8                  @ previous becomes 2nd previous
+        ldr     r2, [r1], #4            @ get sample & update pointer
+        mov     r8, ip, lsl #11         @ check magnitude by shifting left then right
+        cmp     ip, r8, asr #11         @  and comparing, branch to 64-bit math if different
+        bne     S717
+        cmp     ip, #0                  @ Z set if decorr value is zero (sample is then unchanged)
+        mla     r8, ip, r4, r11         @ mult decorr value by weight, round,
+        add     r8, r2, r8, asr #10     @  shift, and add to new sample
+        b       S718
+
+S717:   mov     r8, #0                  @ use 64-bit multiply to avoid overflow
+        smlal   r11, r8, r4, ip         @ r8:r11 = 512 + (weight_A * decorr value)
+        add     r8, r2, r8, lsl #22     @ sample + (64-bit result >> 10): high word part
+        add     r8, r8, r11, lsr #10    @  ... plus low word part
+        mov     r11, #512               @ restore rounding constant overwritten by smlal
+
+S718:   strne   r8, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0                  @ skip weight update if decorr value or sample is zero
+        beq     S129
+        teq     ip, r2                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs match:  weight_A += delta
+
+S129:   cmp     r7, r1                  @ loop back if more samples to do
+        bhi     mono_term_17_loop
+        b       mono_store_1718         @ common exit for terms 17 & 18
+
+/*
+ ******************************************************************************
+ * Loop to handle term = 18 condition
+ *
+ * r0 =                         r8 = previous sample
+ * r1 = bptr                    r9 =
+ * r2 = current sample          r10 =
+ * r3 =                         r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = decorrelation value
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = second previous sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+mono_term_18_loop:
+        sub     ip, r8, lr              @ decorr value =
+        mov     lr, r8                  @  ((3 * prev) - 2nd prev) >> 1
+        add     ip, r8, ip, asr #1      @  computed as prev + ((prev - 2nd prev) >> 1)
+        ldr     r2, [r1], #4            @ get sample & update pointer
+        mov     r8, ip, lsl #11         @ check magnitude by shifting left then right
+        cmp     ip, r8, asr #11         @  and comparing, branch to 64-bit math if different
+        bne     S817
+        cmp     ip, #0                  @ Z set if decorr value is zero (sample is then unchanged)
+        mla     r8, ip, r4, r11         @ mult decorr value by weight, round,
+        add     r8, r2, r8, asr #10     @  shift, and add to new sample
+        b       S818
+
+S817:   mov     r8, #0                  @ use 64-bit multiply to avoid overflow
+        smlal   r11, r8, r4, ip         @ r8:r11 = 512 + (weight_A * decorr value)
+        add     r8, r2, r8, lsl #22     @ sample + (64-bit result >> 10): high word part
+        add     r8, r8, r11, lsr #10    @  ... plus low word part
+        mov     r11, #512               @ restore rounding constant overwritten by smlal
+
+S818:   strne   r8, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0                  @ skip weight update if decorr value or sample is zero
+        beq     S141
+        teq     ip, r2                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs match:  weight_A += delta
+
+S141:   cmp     r7, r1                  @ loop back if more samples to do
+        bhi     mono_term_18_loop
+
+/* common exit for terms 17 & 18 */
+
+mono_store_1718:
+        str     r8, [r5, #16]           @ store sample history into struct (samples_A [0] = previous)
+        str     lr, [r5, #20]           @ samples_A [1] = second previous
+        b       mono_common_exit        @ and return
+
+/*
+ ******************************************************************************
+ * Loop to handle term = 2 condition
+ * (note that this case can be handled by the default term handler (1-8), but
+ * this special case is faster because it doesn't have to read memory twice)
+ *
+ * r0 =                         r8 = previous sample
+ * r1 = bptr                    r9 =
+ * r2 = current sample          r10 =
+ * r3 =                         r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = decorrelation value
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr = second previous sample
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+mono_term_2_loop:
+        mov     ip, lr                  @ get decorrelation value (2nd previous sample)
+        mov     lr, r8                  @ previous becomes 2nd previous
+        ldr     r2, [r1], #4            @ get sample & update pointer
+        mov     r8, ip, lsl #11         @ check magnitude by shifting left then right
+        cmp     ip, r8, asr #11         @  and comparing, branch to 64-bit math if different
+        bne     S917
+        cmp     ip, #0                  @ Z set if decorr value is zero (sample is then unchanged)
+        mla     r8, ip, r4, r11         @ mult decorr value by weight, round,
+        add     r8, r2, r8, asr #10     @  shift, and add to new sample
+        b       S918
+
+S917:   mov     r8, #0                  @ use 64-bit multiply to avoid overflow
+        smlal   r11, r8, r4, ip         @ r8:r11 = 512 + (weight_A * decorr value)
+        add     r8, r2, r8, lsl #22     @ sample + (64-bit result >> 10): high word part
+        add     r8, r8, r11, lsr #10    @  ... plus low word part
+        mov     r11, #512               @ restore rounding constant overwritten by smlal
+
+S918:   strne   r8, [r1, #-4]           @ if change possible, store sample back
+        cmpne   r2, #0                  @ skip weight update if decorr value or sample is zero
+        beq     S029
+        teq     ip, r2                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs match:  weight_A += delta
+
+S029:   cmp     r7, r1                  @ loop back if more samples to do
+        bhi     mono_term_2_loop
+        b       mono_default_term_exit  @ this exit updates all dpp->samples
+
+/*
+ ******************************************************************************
+ * Loop to handle default term condition
+ *
+ * r0 =                         r8 = result accumulator
+ * r1 = bptr                    r9 =
+ * r2 = dpp->term               r10 =
+ * r3 = decorrelation value     r11 = 512 (for rounding)
+ * r4 = dpp->weight_A           ip = current sample
+ * r5 = dpp                     sp =
+ * r6 = dpp->delta              lr =
+ * r7 = eptr                    pc =
+ *******************************************************************************
+ */
+
+mono_term_default_loop:
+        ldr     ip, [r1]                @ get original sample
+        ldr     r3, [r1, -r2, asl #2]   @ get decorrelation value based on term
+        mov     r8, r3, lsl #11         @ check magnitude by shifting left then right
+        cmp     r3, r8, asr #11         @  and comparing, branch to 64-bit math if different
+        bne     S617
+        mla     r8, r3, r4, r11         @ mult decorr value by weight, round,
+        add     r8, ip, r8, asr #10     @  shift and add to new sample
+        b       S618
+
+S617:   mov     r8, #0                  @ use 64-bit multiply to avoid overflow
+        smlal   r11, r8, r4, r3         @ r8:r11 = 512 + (weight_A * decorr value)
+        add     r8, ip, r8, lsl #22     @ sample + (64-bit result >> 10): high word part
+        add     r8, r8, r11, lsr #10    @  ... plus low word part
+        mov     r11, #512               @ restore rounding constant overwritten by smlal
+
+S618:   str     r8, [r1], #4            @ store update sample
+        cmp     r3, #0                  @ skip weight update if decorr value
+        cmpne   ip, #0                  @  or original sample is zero
+        beq     S154
+        teq     ip, r3                  @ update weight based on signs
+        submi   r4, r4, r6              @ signs differ: weight_A -= delta
+        addpl   r4, r4, r6              @ signs match:  weight_A += delta
+
+S154:   cmp     r7, r1                  @ loop back if more samples to do
+        bhi     mono_term_default_loop
+
+/*
+ * This exit is used by terms 1-8 to store the previous "term" samples (up to 8)
+ * into the decorr pass structure history
+ */
+
+mono_default_term_exit:
+        ldr     r2, [r5, #0]            @ r2 = dpp->term
+
+S158:   sub     r2, r2, #1              @ r2 = history index, counting down from term - 1
+        sub     r1, r1, #4              @ step back one sample in the buffer
+        ldr     r3, [r1, #0]            @ get sample and store in dpp->samples_A [r2]
+        add     r6, r5, #16
+        str     r3, [r6, r2, asl #2]
+        cmp     r2, #0                  @ repeat until index 0 has been stored
+        bne     S158
+        b       mono_common_exit
+
+/*
+ * Before finally exiting we must store weight back for next time
+ */
+
+mono_common_exit:
+        str     r4, [r5, #8]            @ dpp->weight_A = r4
+        ldmfd   sp!, {r4 - r8, r11, pc} @ restore registers; saved lr goes to pc (return)
+
+#ifdef __ELF__
+        .section .note.GNU-stack,"",%progbits
+#endif
+
diff --git a/third_party/wavpack/src/unpack_dsd.c b/third_party/wavpack/src/unpack_dsd.c
new file mode 100644
index 0000000..11aa04f
--- /dev/null
+++ b/third_party/wavpack/src/unpack_dsd.c
@@ -0,0 +1,616 @@
+////////////////////////////////////////////////////////////////////////////
+//                           **** DSDPACK ****                            //
+//         Lossless DSD (Direct Stream Digital) Audio Compressor          //
+//                Copyright (c) 2013 - 2016 David Bryant.                 //
+//                          All Rights Reserved.                          //
+//      Distributed under the BSD Software License (see license.txt)      //
+////////////////////////////////////////////////////////////////////////////
+
+// unpack_dsd.c
+
+// This module actually handles the uncompression of the DSD audio data.
+
+#ifdef ENABLE_DSD
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "wavpack_local.h"
+
+///////////////////////////// executable code ////////////////////////////////
+
+// This function initializes the main range-encoded data for DSD audio samples
+
+static int init_dsd_block_fast (WavpackStream *wps, WavpackMetadata *wpmd);      // block setup for mode 1
+static int init_dsd_block_high (WavpackStream *wps, WavpackMetadata *wpmd);      // block setup for mode 3
+static int decode_fast (WavpackStream *wps, int32_t *output, int sample_count);  // sample decoder for mode 1
+static int decode_high (WavpackStream *wps, int32_t *output, int sample_count);  // sample decoder for any other non-zero mode
+
+int init_dsd_block (WavpackContext *wpc, WavpackMetadata *wpmd)
+{
+    // Parses the DSD metadata header: byte 0 is the power of two used for the DSD
+    // multiplier and byte 1 the coding mode (0 = raw bytes, 1 = "fast", 3 = "high").
+    // Returns TRUE when the block is ready for decoding, FALSE if it is malformed.
+    WavpackStream *wps = wpc->streams [wpc->current_stream];
+
+    if (wpmd->byte_length < 2)
+        return FALSE;
+
+    wps->dsd.byteptr = (unsigned char *)wpmd->data;
+    wps->dsd.endptr = wps->dsd.byteptr + wpmd->byte_length;
+    if (*wps->dsd.byteptr > 31)     // a shift count of 32 or more is undefined behavior
+        return FALSE;
+
+    wpc->dsd_multiplier = 1U << *wps->dsd.byteptr++;
+    wps->dsd.mode = *wps->dsd.byteptr++;
+
+    if (!wps->dsd.mode) {           // raw mode: payload must be block_samples bytes per channel
+        if (wps->dsd.endptr - wps->dsd.byteptr != wps->wphdr.block_samples * (wps->wphdr.flags & MONO_DATA ? 1 : 2))
+            return FALSE;
+        wps->dsd.ready = 1;
+        return TRUE;
+    }
+    if (wps->dsd.mode == 1)
+        return init_dsd_block_fast (wps, wpmd);
+    return (wps->dsd.mode == 3) ? init_dsd_block_high (wps, wpmd) : FALSE;
+}
+
+int32_t unpack_dsd_samples (WavpackContext *wpc, int32_t *buffer, uint32_t sample_count)
+{
+    // Decodes up to sample_count DSD samples of the current stream's block into buffer
+    // (one byte value per int32_t) and returns the number of samples produced. If the
+    // block is out of position or decoding fails, the output is filled with 0x55 instead.
+    WavpackStream *wps = wpc->streams [wpc->current_stream];
+    uint32_t flags = wps->wphdr.flags;
+
+    // clamp the request to what is left in this block (the second test guards against overflow)
+    if (wps->sample_index + sample_count > GET_BLOCK_INDEX (wps->wphdr) + wps->wphdr.block_samples &&
+        GET_BLOCK_INDEX (wps->wphdr) + wps->wphdr.block_samples - wps->sample_index < sample_count)
+            sample_count = (uint32_t) (GET_BLOCK_INDEX (wps->wphdr) + wps->wphdr.block_samples - wps->sample_index);
+
+    if (GET_BLOCK_INDEX (wps->wphdr) > wps->sample_index || wps->wphdr.block_samples < sample_count)
+        wps->mute_error = TRUE;
+
+    if (!wps->mute_error) {
+        if (wps->dsd.mode) {            // mode 1 uses the "fast" decoder, any other the "high" one
+            int decoded = (wps->dsd.mode == 1) ? decode_fast (wps, buffer, sample_count) :
+                                                 decode_high (wps, buffer, sample_count);
+            if (!decoded)
+                wps->mute_error = TRUE;
+        }
+        else {
+            // mode 0: copy the stored bytes out, folding each one into the running CRC
+            int bytes_left = sample_count * ((flags & MONO_DATA) ? 1 : 2);
+            int32_t *out = buffer;
+
+            if (wps->dsd.endptr - wps->dsd.byteptr < bytes_left)
+                bytes_left = (int)(wps->dsd.endptr - wps->dsd.byteptr);
+            for (; bytes_left--; ++out) {
+                *out = *wps->dsd.byteptr++;
+                wps->crc += (wps->crc << 1) + *out;
+            }
+        }
+    }
+
+    if (wps->mute_error) {
+        // failed or misplaced block: write 0x55 to every output slot
+        int fill_count = sample_count;
+
+        if (!(wpc->reduced_channels == 1 || wpc->config.num_channels == 1 || (flags & MONO_FLAG)))
+            fill_count = sample_count * 2;
+        while (fill_count--)
+            *buffer++ = 0x55;
+    }
+    else if (flags & FALSE_STEREO) {
+        // FALSE_STEREO: turn each value into a pair, walking backward so nothing is overwritten early
+        int32_t *dst = buffer + sample_count * 2;
+        int32_t *src = buffer + sample_count;
+        int32_t pairs = sample_count;
+
+        while (pairs--) {
+            --src;
+            *--dst = *src;
+            *--dst = *src;
+        }
+    }
+
+    wps->sample_index += sample_count;
+    return sample_count;
+}
+
+/*------------------------------------------------------------------------------------------------------------------------*/
+
+// #define DSD_BYTE_READY(low,high) (((low) >> 24) == ((high) >> 24))
+// #define DSD_BYTE_READY(low,high) (!(((low) ^ (high)) >> 24))
+#define DSD_BYTE_READY(low,high) (!(((low) ^ (high)) & 0xff000000))
+#define MAX_HISTORY_BITS    5
+
+static int init_dsd_block_fast (WavpackStream *wps, WavpackMetadata *wpmd)
+{
+    // Prepares "fast" mode decoding: reads the history-bin count and the per-bin byte
+    // probability tables (raw, or run-length coded when the marker byte is below 0xff), builds
+    // the cumulative and lookup tables, and loads four coded bytes. FALSE on bad data or no memory.
+    unsigned char history_bits, max_probability;
+    int total_summed_probabilities = 0, i;
+
+    if (wps->dsd.byteptr == wps->dsd.endptr)
+        return FALSE;
+
+    history_bits = *wps->dsd.byteptr++;
+
+    if (wps->dsd.byteptr == wps->dsd.endptr || history_bits > MAX_HISTORY_BITS)
+        return FALSE;
+
+    wps->dsd.history_bins = 1 << history_bits;
+
+    wps->dsd.value_lookup = (unsigned char **)calloc (wps->dsd.history_bins, sizeof (*wps->dsd.value_lookup)); // entries start zeroed
+    wps->dsd.summed_probabilities = (int16_t (*)[256])malloc (sizeof (*wps->dsd.summed_probabilities) * wps->dsd.history_bins);
+    wps->dsd.probabilities = (unsigned char (*)[256])malloc (sizeof (*wps->dsd.probabilities) * wps->dsd.history_bins);
+    if (!wps->dsd.value_lookup || !wps->dsd.summed_probabilities || !wps->dsd.probabilities)
+        return FALSE;       // allocation failed: the tables must not be written through
+
+    max_probability = *wps->dsd.byteptr++;
+
+    if (max_probability < 0xff) {
+        unsigned char *outptr = (unsigned char *) wps->dsd.probabilities;
+        unsigned char *outend = outptr + sizeof (*wps->dsd.probabilities) * wps->dsd.history_bins;
+
+        while (outptr < outend && wps->dsd.byteptr < wps->dsd.endptr) {
+            int code = *wps->dsd.byteptr++;
+            if (code > max_probability) {   // a run of (code - max_probability) zero entries
+                int zcount = code - max_probability;
+                while (outptr < outend && zcount--)
+                    *outptr++ = 0;
+            }
+            else if (code)
+                *outptr++ = code;
+            else                            // a zero code ends the table
+                break;
+        }
+
+        if (outptr < outend || (wps->dsd.byteptr < wps->dsd.endptr && *wps->dsd.byteptr++))
+            return FALSE;
+    }
+    else if (wps->dsd.endptr - wps->dsd.byteptr > (int) sizeof (*wps->dsd.probabilities) * wps->dsd.history_bins) {
+        memcpy (wps->dsd.probabilities, wps->dsd.byteptr, sizeof (*wps->dsd.probabilities) * wps->dsd.history_bins);
+        wps->dsd.byteptr += sizeof (*wps->dsd.probabilities) * wps->dsd.history_bins;
+    }
+    else
+        return FALSE;
+
+    for (wps->dsd.p0 = 0; wps->dsd.p0 < wps->dsd.history_bins; ++wps->dsd.p0) {
+        int32_t sum_values;
+        unsigned char *vp;
+
+        for (sum_values = i = 0; i < 256; ++i)
+            wps->dsd.summed_probabilities [wps->dsd.p0] [i] = sum_values += wps->dsd.probabilities [wps->dsd.p0] [i];
+        if (sum_values) {
+            total_summed_probabilities += sum_values;
+            vp = wps->dsd.value_lookup [wps->dsd.p0] = (unsigned char *)malloc (sum_values);
+            if (!vp)
+                return FALSE;
+
+            // one lookup slot per unit of probability, each holding the byte value it decodes to
+            for (i = 0; i < 256; i++) {
+                int c = wps->dsd.probabilities [wps->dsd.p0] [i];
+                while (c--)
+                    *vp++ = i;
+            }
+        }
+    }
+
+    if (wps->dsd.endptr - wps->dsd.byteptr < 4 || total_summed_probabilities > wps->dsd.history_bins * 1280)
+        return FALSE;
+
+    for (i = 4; i--;)
+        wps->dsd.value = (wps->dsd.value << 8) | *wps->dsd.byteptr++;
+    wps->dsd.p0 = wps->dsd.p1 = 0;
+    wps->dsd.low = 0; wps->dsd.high = 0xffffffff;
+    wps->dsd.ready = 1;
+    return TRUE;
+}
+
+static int decode_fast (WavpackStream *wps, int32_t *output, int sample_count)
+{
+    // Range-decodes sample_count samples (two values per sample unless MONO_DATA is set)
+    // of "fast" mode data into output. Returns sample_count, or 0 when the tables or the
+    // coded stream turn out to be inconsistent.
+    int total_samples = sample_count;
+
+    if (!(wps->wphdr.flags & MONO_DATA))
+        total_samples *= 2;
+
+    while (total_samples--) {
+        int mult, index, code, i;
+
+        if (!wps->dsd.summed_probabilities [wps->dsd.p0] [255])    // current bin has no probabilities at all
+            return 0;
+
+        mult = (wps->dsd.high - wps->dsd.low) / wps->dsd.summed_probabilities [wps->dsd.p0] [255];
+        if (!mult) {    // range too narrow to divide: reload value (if 4 bytes remain) and reset the range
+            if (wps->dsd.endptr - wps->dsd.byteptr >= 4)
+                for (i = 4; i--;)
+                    wps->dsd.value = (wps->dsd.value << 8) | *wps->dsd.byteptr++;
+
+            wps->dsd.low = 0;
+            wps->dsd.high = 0xffffffff;
+            mult = wps->dsd.high / wps->dsd.summed_probabilities [wps->dsd.p0] [255];
+            if (!mult)
+                return 0;
+        }
+
+        index = (wps->dsd.value - wps->dsd.low) / mult;
+        if (index >= wps->dsd.summed_probabilities [wps->dsd.p0] [255])    // would index past the lookup table
+            return 0;
+        // look up the decoded byte, then narrow [low, high] to that byte's sub-range
+        if ((*output++ = code = wps->dsd.value_lookup [wps->dsd.p0] [index]))
+            wps->dsd.low += wps->dsd.summed_probabilities [wps->dsd.p0] [code-1] * mult;
+
+        wps->dsd.high = wps->dsd.low + wps->dsd.probabilities [wps->dsd.p0] [code] * mult - 1;
+        wps->crc += (wps->crc << 1) + code;
+        // pick the bin for the next value: the last byte (mono) or the byte decoded before it (stereo)
+        if (wps->wphdr.flags & MONO_DATA)
+            wps->dsd.p0 = code & (wps->dsd.history_bins-1);
+        else {
+            wps->dsd.p0 = wps->dsd.p1;
+            wps->dsd.p1 = code & (wps->dsd.history_bins-1);
+        }
+
+        while (DSD_BYTE_READY (wps->dsd.high, wps->dsd.low) && wps->dsd.byteptr < wps->dsd.endptr) {    // top bytes agree: shift in a byte
+            wps->dsd.value = (wps->dsd.value << 8) | *wps->dsd.byteptr++;
+            wps->dsd.high = (wps->dsd.high << 8) | 0xff;
+            wps->dsd.low <<= 8;
+        }
+    }
+
+    return sample_count;
+}
+
+/*------------------------------------------------------------------------------------------------------------------------*/
+
+#define PTABLE_BITS 8                   // log2 of the probability table size
+#define PTABLE_BINS (1<<PTABLE_BITS)    // number of entries allocated for dsd.ptable
+#define PTABLE_MASK (PTABLE_BINS-1)     // masks a filter value down to a table index
+
+#define UP   0x010000fe                 // value a table entry moves toward when decode_high takes the "value <= split" branch
+#define DOWN 0x00010000                 // value a table entry moves toward otherwise (also the decay target in init_ptable)
+#define DECAY 8                         // each update moves an entry 1/2^DECAY of the remaining distance
+
+#define PRECISION 20                    // stored filter bytes are scaled up by (PRECISION - 8) bits
+#define VALUE_ONE (1 << PRECISION)      // masked with filter0 (-1 or 0) to feed filter1 and filter2
+#define PRECISION_USE 12                // filter value is shifted right by (PRECISION - PRECISION_USE) before indexing the table
+
+#define RATE_S 20                       // the only rate_s value init_dsd_block_high accepts
+
+static void init_ptable (int *table, int rate_i, int rate_s)
+{
+    // Fills table [0 .. PTABLE_BINS-1]: the lower half receives a sequence decaying from
+    // 0x808000 toward DOWN, the upper half the mirrored values (0x100ffff - value).
+    int level = 0x808000, rate = rate_i << 8, steps, bin;
+
+    for (steps = (rate + 128) >> 8; steps--;)
+        level += (DOWN - level) >> DECAY;
+
+    for (bin = 0; bin < PTABLE_BINS/2; ++bin) {
+        table [bin] = level;
+        table [PTABLE_BINS-1-bin] = 0x100ffff - level;
+        if (level <= 0x010000)          // no further decay once this floor is reached
+            continue;
+        rate += (rate * rate_s + 128) >> 8;
+        for (steps = (rate + 64) >> 7; steps--;)
+            level += (DOWN - level) >> DECAY;
+    }
+}
+
+static int init_dsd_block_high (WavpackStream *wps, WavpackMetadata *wpmd)
+{
+    // Prepares "high" mode decoding: reads the two rate bytes, builds the adaptive
+    // probability table, loads each channel's initial filter state and primes the range
+    // decoder. Returns TRUE on success, FALSE on malformed data or allocation failure.
+    uint32_t flags = wps->wphdr.flags;
+    int channel, rate_i, rate_s, i;
+
+    // needs 2 rate bytes + 7 bytes per channel + 4 bytes of initial decoder value
+    if (wps->dsd.endptr - wps->dsd.byteptr < ((flags & MONO_DATA) ? 13 : 20))
+        return FALSE;
+
+    rate_i = *wps->dsd.byteptr++;
+    rate_s = *wps->dsd.byteptr++;
+    if (rate_s != RATE_S)
+        return FALSE;
+
+    wps->dsd.ptable = (int32_t *)malloc (PTABLE_BINS * sizeof (*wps->dsd.ptable));
+    if (!wps->dsd.ptable)           // init_ptable() would otherwise write through NULL
+        return FALSE;
+
+    init_ptable (wps->dsd.ptable, rate_i, rate_s);
+    for (channel = 0; channel < ((flags & MONO_DATA) ? 1 : 2); ++channel) {
+        DSDfilters *sp = wps->dsd.filters + channel;
+        sp->filter1 = *wps->dsd.byteptr++ << (PRECISION - 8);
+        sp->filter2 = *wps->dsd.byteptr++ << (PRECISION - 8);
+        sp->filter3 = *wps->dsd.byteptr++ << (PRECISION - 8);
+        sp->filter4 = *wps->dsd.byteptr++ << (PRECISION - 8);
+        sp->filter5 = *wps->dsd.byteptr++ << (PRECISION - 8);
+        sp->filter6 = 0;
+        sp->factor = *wps->dsd.byteptr++ & 0xff;
+        sp->factor |= (*wps->dsd.byteptr++ << 8) & 0xff00;
+        sp->factor = (int32_t)((uint32_t) sp->factor << 16) >> 16;  // sign-extend 16 bits without overflowing a signed shift
+    }
+
+    wps->dsd.high = 0xffffffff;
+    wps->dsd.low = 0x0;
+    for (i = 4; i--;)
+        wps->dsd.value = (wps->dsd.value << 8) | *wps->dsd.byteptr++;
+    wps->dsd.ready = 1;
+    return TRUE;
+}
+
+static int decode_high (WavpackStream *wps, int32_t *output, int sample_count)
+{
+    // Decodes sample_count samples of "high" mode data into output, one byte value per
+    // channel per sample (two channels unless MONO_DATA is set). Each byte is built from
+    // eight adaptively range-decoded bits. Always returns sample_count.
+    int total_samples = sample_count, stereo = (wps->wphdr.flags & MONO_DATA) ? 0 : 1;
+    DSDfilters *sp = wps->dsd.filters;
+
+    while (total_samples--) {
+        int bitcount = 8;
+        // starting value for each channel, used below to select a probability table entry
+        sp [0].value = sp [0].filter1 - sp [0].filter5 + ((sp [0].filter6 * sp [0].factor) >> 2);
+        if (stereo)
+            sp [1].value = sp [1].filter1 - sp [1].filter5 + ((sp [1].filter6 * sp [1].factor) >> 2);
+
+        while (bitcount--) {
+            int32_t *pp = wps->dsd.ptable + ((sp [0].value >> (PRECISION - PRECISION_USE)) & PTABLE_MASK);
+            uint32_t split = wps->dsd.low + ((wps->dsd.high - wps->dsd.low) >> 8) * (*pp >> 16);
+            // decode one bit: value at or below the split gives filter0 = -1 (a 1 bit), otherwise 0
+            if (wps->dsd.value <= split) {
+                wps->dsd.high = split;
+                *pp += (UP - *pp) >> DECAY;
+                sp [0].filter0 = -1;
+            }
+            else {
+                wps->dsd.low = split + 1;
+                *pp += (DOWN - *pp) >> DECAY;
+                sp [0].filter0 = 0;
+            }
+            // renormalize: shift in a new byte while the top bytes of high and low agree
+            while (DSD_BYTE_READY (wps->dsd.high, wps->dsd.low) && wps->dsd.byteptr < wps->dsd.endptr) {
+                wps->dsd.value = (wps->dsd.value << 8) | *wps->dsd.byteptr++;
+                wps->dsd.high = (wps->dsd.high << 8) | 0xff;
+                wps->dsd.low <<= 8;
+            }
+            // append the bit to the channel's byte, adjust factor and update the filter chain
+            sp [0].value += sp [0].filter6 << 3;
+            sp [0].byte = (sp [0].byte << 1) | (sp [0].filter0 & 1);
+            sp [0].factor += (((sp [0].value ^ sp [0].filter0) >> 31) | 1) & ((sp [0].value ^ (sp [0].value - (sp [0].filter6 << 4))) >> 31);
+            sp [0].filter1 += ((sp [0].filter0 & VALUE_ONE) - sp [0].filter1) >> 6;
+            sp [0].filter2 += ((sp [0].filter0 & VALUE_ONE) - sp [0].filter2) >> 4;
+            sp [0].filter3 += (sp [0].filter2 - sp [0].filter3) >> 4;
+            sp [0].filter4 += (sp [0].filter3 - sp [0].filter4) >> 4;
+            sp [0].value = (sp [0].filter4 - sp [0].filter5) >> 4;
+            sp [0].filter5 += sp [0].value;
+            sp [0].filter6 += (sp [0].value - sp [0].filter6) >> 3;
+            sp [0].value = sp [0].filter1 - sp [0].filter5 + ((sp [0].filter6 * sp [0].factor) >> 2);
+
+            if (!stereo)
+                continue;
+            // second channel: the same steps applied to sp [1]
+            pp = wps->dsd.ptable + ((sp [1].value >> (PRECISION - PRECISION_USE)) & PTABLE_MASK);
+            split = wps->dsd.low + ((wps->dsd.high - wps->dsd.low) >> 8) * (*pp >> 16);
+            if (wps->dsd.value <= split) {
+                wps->dsd.high = split;
+                *pp += (UP - *pp) >> DECAY;
+                sp [1].filter0 = -1;
+            }
+            else {
+                wps->dsd.low = split + 1;
+                *pp += (DOWN - *pp) >> DECAY;
+                sp [1].filter0 = 0;
+            }
+
+            while (DSD_BYTE_READY (wps->dsd.high, wps->dsd.low) && wps->dsd.byteptr < wps->dsd.endptr) {
+                wps->dsd.value = (wps->dsd.value << 8) | *wps->dsd.byteptr++;
+                wps->dsd.high = (wps->dsd.high << 8) | 0xff;
+                wps->dsd.low <<= 8;
+            }
+
+            sp [1].value += sp [1].filter6 << 3;
+            sp [1].byte = (sp [1].byte << 1) | (sp [1].filter0 & 1);
+            sp [1].factor += (((sp [1].value ^ sp [1].filter0) >> 31) | 1) & ((sp [1].value ^ (sp [1].value - (sp [1].filter6 << 4))) >> 31);
+            sp [1].filter1 += ((sp [1].filter0 & VALUE_ONE) - sp [1].filter1) >> 6;
+            sp [1].filter2 += ((sp [1].filter0 & VALUE_ONE) - sp [1].filter2) >> 4;
+            sp [1].filter3 += (sp [1].filter2 - sp [1].filter3) >> 4;
+            sp [1].filter4 += (sp [1].filter3 - sp [1].filter4) >> 4;
+            sp [1].value = (sp [1].filter4 - sp [1].filter5) >> 4;
+            sp [1].filter5 += sp [1].value;
+            sp [1].filter6 += (sp [1].value - sp [1].filter6) >> 3;
+            sp [1].value = sp [1].filter1 - sp [1].filter5 + ((sp [1].filter6 * sp [1].factor) >> 2);
+        }
+        // output the low 8 bits of each channel's byte, fold them into the CRC, and shrink factor
+        wps->crc += (wps->crc << 1) + (*output++ = sp [0].byte & 0xff);
+        sp [0].factor -= (sp [0].factor + 512) >> 10;
+        if (stereo) {
+            wps->crc += (wps->crc << 1) + (*output++ = wps->dsd.filters [1].byte & 0xff);
+            wps->dsd.filters [1].factor -= (wps->dsd.filters [1].factor + 512) >> 10;
+        }
+    }
+
+    return sample_count;
+}
+
+/*------------------------------------------------------------------------------------------------------------------------*/
+
+#if 0
+
+// 80 term DSD decimation filter
+// < 1 dB down at 20 kHz
+// > 108 dB stopband attenuation (fs/16)
+
+static const int32_t decm_filter [] = {
+    4, 17, 56, 147, 336, 693, 1320, 2359,
+    4003, 6502, 10170, 15392, 22623, 32389, 45275, 61920,
+    82994, 109174, 141119, 179431, 224621, 277068, 336983, 404373,
+    479004, 560384, 647741, 740025, 835917, 933849, 1032042, 1128551,
+    1221329, 1308290, 1387386, 1456680, 1514425, 1559128, 1589610, 1605059,
+    1605059, 1589610, 1559128, 1514425, 1456680, 1387386, 1308290, 1221329,
+    1128551, 1032042, 933849, 835917, 740025, 647741, 560384, 479004,
+    404373, 336983, 277068, 224621, 179431, 141119, 109174, 82994,
+    61920, 45275, 32389, 22623, 15392, 10170, 6502, 4003,
+    2359, 1320, 693, 336, 147, 56, 17, 4,
+};
+
+#define NUM_FILTER_TERMS 80
+
+#else
+
+// 56 term decimation filter
+// < 0.5 dB down at 20 kHz
+// > 100 dB stopband attenuation (fs/12)
+
+static const int32_t decm_filter [] = {
+    4, 17, 56, 147, 336, 692, 1315, 2337,
+    3926, 6281, 9631, 14216, 20275, 28021, 37619, 49155,
+    62616, 77870, 94649, 112551, 131049, 149507, 167220, 183448,
+    197472, 208636, 216402, 220385, 220385, 216402, 208636, 197472,
+    183448, 167220, 149507, 131049, 112551, 94649, 77870, 62616,
+    49155, 37619, 28021, 20275, 14216, 9631, 6281, 3926,
+    2337, 1315, 692, 336, 147, 56, 17, 4,
+};
+
+#define NUM_FILTER_TERMS 56
+
+#endif
+
+#define HISTORY_BYTES ((NUM_FILTER_TERMS+7)/8)  // bytes needed to hold one bit per filter term
+
+typedef struct {
+    unsigned char delay [HISTORY_BYTES];        // recent input bytes, oldest at index 0, newest last
+} DecimationChannel;
+
+typedef struct {
+    int32_t conv_tables [HISTORY_BYTES] [256];  // per history byte: summed filter terms for every possible byte value
+    DecimationChannel *chans;                   // array of num_channels histories
+    int num_channels;                           // channel count given to decimate_dsd_init()
+} DecimationContext;
+
+void *decimate_dsd_init (int num_channels)
+{
+    // Allocates a decimation context for num_channels channels, precomputes the
+    // byte-at-a-time convolution tables from decm_filter and resets the channel
+    // histories. Returns the context, or NULL if memory could not be allocated.
+    DecimationContext *ctx = (DecimationContext *)malloc (sizeof (DecimationContext));
+    double total = 0, scale;
+    int zero_terms = 0, term, byte_val;
+
+    if (!ctx)
+        return NULL;
+
+    memset (ctx, 0, sizeof (*ctx));
+    ctx->num_channels = num_channels;
+    ctx->chans = (DecimationChannel *)malloc (num_channels * sizeof (DecimationChannel));
+
+    if (!ctx->chans) {
+        free (ctx);
+        return NULL;
+    }
+
+    for (term = 0; term < NUM_FILTER_TERMS; ++term)
+        total += decm_filter [term];
+
+    scale = ((1 << 23) - 1) / total * 16.0;
+
+    for (term = 0; term < NUM_FILTER_TERMS; ++term) {
+        int scaled_term = (int) floor (decm_filter [term] * scale + 0.5);
+        int32_t *row = ctx->conv_tables [term >> 3];
+        int bit = 0x80 >> (term & 0x7);
+
+        if (!scaled_term) {
+            zero_terms++;           // tallied only; nothing reads the count
+            continue;
+        }
+
+        // a set bit in the byte adds the term, a clear bit subtracts it
+        for (byte_val = 0; byte_val < 256; ++byte_val)
+            row [byte_val] += (byte_val & bit) ? scaled_term : -scaled_term;
+    }
+
+    decimate_dsd_reset (ctx);
+    return ctx;
+}
+
+void decimate_dsd_reset (void *decimate_context)
+{
+    // Refills every channel's history with 0x55 bytes; a NULL context is ignored.
+    DecimationContext *ctx = (DecimationContext *) decimate_context;
+    int ch;
+
+    if (ctx == NULL)
+        return;
+
+    for (ch = 0; ch < ctx->num_channels; ++ch)
+        memset (ctx->chans [ch].delay, 0x55, sizeof (ctx->chans [ch].delay));
+}
+
+void decimate_dsd_run (void *decimate_context, int32_t *samples, int num_samples)
+{
+    // Filters num_samples frames in place. The values in samples are taken channel by
+    // channel (chan cycles through num_channels); each is stored as one byte in that
+    // channel's history and replaced by the filter output. A NULL context does nothing.
+    DecimationContext *context = (DecimationContext *) decimate_context;
+    int chan = 0;
+
+    if (!context)
+        return;
+
+    while (num_samples) {
+        DecimationChannel *sp = context->chans + chan;
+        int sum = 0;
+        // shift the history down one byte, append the new byte, and sum the table entries
+#if (HISTORY_BYTES == 10)       // unrolled for the 80-term filter
+        sum += context->conv_tables [0] [sp->delay [0] = sp->delay [1]];
+        sum += context->conv_tables [1] [sp->delay [1] = sp->delay [2]];
+        sum += context->conv_tables [2] [sp->delay [2] = sp->delay [3]];
+        sum += context->conv_tables [3] [sp->delay [3] = sp->delay [4]];
+        sum += context->conv_tables [4] [sp->delay [4] = sp->delay [5]];
+        sum += context->conv_tables [5] [sp->delay [5] = sp->delay [6]];
+        sum += context->conv_tables [6] [sp->delay [6] = sp->delay [7]];
+        sum += context->conv_tables [7] [sp->delay [7] = sp->delay [8]];
+        sum += context->conv_tables [8] [sp->delay [8] = sp->delay [9]];
+        sum += context->conv_tables [9] [sp->delay [9] = *samples];
+#elif (HISTORY_BYTES == 7)      // unrolled for the 56-term filter
+        sum += context->conv_tables [0] [sp->delay [0] = sp->delay [1]];
+        sum += context->conv_tables [1] [sp->delay [1] = sp->delay [2]];
+        sum += context->conv_tables [2] [sp->delay [2] = sp->delay [3]];
+        sum += context->conv_tables [3] [sp->delay [3] = sp->delay [4]];
+        sum += context->conv_tables [4] [sp->delay [4] = sp->delay [5]];
+        sum += context->conv_tables [5] [sp->delay [5] = sp->delay [6]];
+        sum += context->conv_tables [6] [sp->delay [6] = *samples];
+#else                           // generic loop for any other filter length
+        int i;
+        for (i = 0; i < HISTORY_BYTES-1; ++i)
+            sum += context->conv_tables [i] [sp->delay [i] = sp->delay [i+1]];
+        sum += context->conv_tables [i] [sp->delay [i] = *samples];
+#endif
+
+        *samples++ = sum >> 4;  // drop 4 bits (filter_scale in decimate_dsd_init carries a factor of 16)
+        if (++chan == context->num_channels) {  // one frame done once every channel has been handled
+            num_samples--;
+            chan = 0;
+        }
+    }
+}
+
+void decimate_dsd_destroy (void *decimate_context)
+{
+    // Releases a context and its channel array (as allocated by decimate_dsd_init);
+    // passing NULL does nothing.
+    DecimationContext *ctx = (DecimationContext *) decimate_context;
+
+    if (ctx == NULL)
+        return;
+
+    free (ctx->chans);          // free() accepts NULL, so no separate test is needed
+    free (ctx);
+}
+
+#endif      // ENABLE_DSD
diff --git a/third_party/wavpack/src/unpack_floats.c b/third_party/wavpack/src/unpack_floats.c
new file mode 100644
index 0000000..cc045dd
--- /dev/null
+++ b/third_party/wavpack/src/unpack_floats.c
@@ -0,0 +1,134 @@
+////////////////////////////////////////////////////////////////////////////
+//                           **** WAVPACK ****                            //
+//                  Hybrid Lossless Wavefile Compressor                   //
+//              Copyright (c) 1998 - 2013 Conifer Software.               //
+//                          All Rights Reserved.                          //
+//      Distributed under the BSD Software License (see license.txt)      //
+////////////////////////////////////////////////////////////////////////////
+
+// unpack_floats.c
+
+// This module deals with the restoration of floating-point data. Note that no
+// floating point math is involved here...the values are only processed with
+// the macros that directly access the mantissa, exponent, and sign fields.
+// That's why we use the f32 type instead of the built-in float type.
+
+#include <stdlib.h>
+
+#include "wavpack_local.h"
+
+static void float_values_nowvx (WavpackStream *wps, int32_t *values, int32_t num_values);
+
+void float_values (WavpackStream *wps, int32_t *values, int32_t num_values)
+{
+    // Converts num_values decoded integers in place into f32 values, reading any extra
+    // bits they need from the wvx bitstream and accumulating a checksum in wps->crc_x.
+    // When the wvx bitstream is not open the work is handed to float_values_nowvx().
+    uint32_t crc = wps->crc_x;
+
+    if (!bs_is_open (&wps->wvxbits)) {
+        float_values_nowvx (wps, values, num_values);
+        return;
+    }
+
+    while (num_values--) {
+        int shift_count = 0, exp = wps->float_max_exp;
+        f32 outval = 0;
+        uint32_t temp;
+        // a zero input may still stand for an explicitly sent float or a signed zero
+        if (*values == 0) {
+            if (wps->float_flags & FLOAT_ZEROS_SENT) {
+                if (getbit (&wps->wvxbits)) {
+                    getbits (&temp, 23, &wps->wvxbits);
+                    set_mantissa (outval, temp);
+                    if (exp >= 25) {
+                        getbits (&temp, 8, &wps->wvxbits);
+                        set_exponent (outval, temp);
+                    }
+                    set_sign (outval, getbit (&wps->wvxbits));
+                }
+                else if (wps->float_flags & FLOAT_NEG_ZEROS)
+                    set_sign (outval, getbit (&wps->wvxbits));
+            }
+        }
+        else {
+            *values <<= wps->float_shift;
+
+            if (*values < 0) {
+                *values = -*values;
+                set_sign (outval, 1);
+            }
+            // magnitude 0x1000000 marks an exponent-255 value; one bit says whether a mantissa follows
+            if (*values == 0x1000000) {
+                if (getbit (&wps->wvxbits)) {
+                    getbits (&temp, 23, &wps->wvxbits);
+                    set_mantissa (outval, temp);
+                }
+
+                set_exponent (outval, 255);
+            }
+            else {
+                if (exp)
+                    while (!(*values & 0x800000) && --exp) {    // shift up until bit 23 is set or exp reaches zero
+                        shift_count++;
+                        *values <<= 1;
+                    }
+                // fill the vacated low bits: all ones, ones if a single sent bit is set, or the bits as sent
+                if (shift_count) {
+                    if ((wps->float_flags & FLOAT_SHIFT_ONES) ||
+                        ((wps->float_flags & FLOAT_SHIFT_SAME) && getbit (&wps->wvxbits)))
+                            *values |= ((1 << shift_count) - 1);
+                    else if (wps->float_flags & FLOAT_SHIFT_SENT) {
+                        getbits (&temp, shift_count, &wps->wvxbits);
+                        *values |= temp & ((1 << shift_count) - 1);
+                    }
+                }
+                set_mantissa (outval, *values);
+                set_exponent (outval, exp);
+            }
+        }
+
+        crc = crc * 27 + get_mantissa (outval) * 9 + get_exponent (outval) * 3 + get_sign (outval);
+        * (f32 *) values++ = outval;    // overwrite the integer slot with the float bit pattern
+    }
+
+    wps->crc_x = crc;
+}
+
+static void float_values_nowvx (WavpackStream *wps, int32_t *values, int32_t num_values)
+{
+    // In-place integer-to-f32 conversion used by float_values() when the wvx bitstream
+    // is not open: no bits are read and no checksum is updated.
+    for (; num_values--; ++values) {
+        int pad_bits = 0, exponent = wps->float_max_exp;
+        f32 result = 0;
+
+        if (*values != 0) {
+            *values <<= wps->float_shift;
+
+            if (*values < 0) {
+                *values = -*values;
+                set_sign (result, 1);
+            }
+
+            if (*values >= 0x1000000) {
+                // too wide: shift down until bits 24-27 are clear, raising the exponent
+                for (; *values & 0xf000000; ++exponent)
+                    *values >>= 1;
+            }
+            else if (exponent) {
+                // too narrow: shift up until bit 23 is set or the exponent reaches zero
+                while (!(*values & 0x800000) && --exponent) {
+                    *values <<= 1;
+                    pad_bits++;
+                }
+                if (pad_bits && (wps->float_flags & FLOAT_SHIFT_ONES))
+                    *values |= ((1 << pad_bits) - 1);
+            }
+
+            set_mantissa (result, *values);
+            set_exponent (result, exponent);
+        }
+        * (f32 *) values = result;
+    }
+}
diff --git a/third_party/wavpack/src/unpack_seek.c b/third_party/wavpack/src/unpack_seek.c
new file mode 100644
index 0000000..f3ab081
--- /dev/null
+++ b/third_party/wavpack/src/unpack_seek.c
@@ -0,0 +1,375 @@
+////////////////////////////////////////////////////////////////////////////
+//                           **** WAVPACK ****                            //
+//                  Hybrid Lossless Wavefile Compressor                   //
+//              Copyright (c) 1998 - 2013 Conifer Software.               //
+//                          All Rights Reserved.                          //
+//      Distributed under the BSD Software License (see license.txt)      //
+////////////////////////////////////////////////////////////////////////////
+
+// unpack_seek.c
+
+// This module provides the high-level API for unpacking audio data from
+// a specific sample index (i.e., seeking).
+
+#ifndef NO_SEEKING
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "wavpack_local.h"
+
+///////////////////////////// executable code ////////////////////////////////
+
+static int64_t find_sample (WavpackContext *wpc, void *infile, int64_t header_pos, int64_t sample);
+
+// Seek to the specified sample index, returning TRUE on success. Note that
+// files generated with version 4.0 or newer will seek almost immediately.
+// Older files can take quite long if required to seek through unplayed
+// portions of the file, but will create a seek map so that reverse seeks
+// (or forward seeks to already scanned areas) will be very fast. After a
+// FALSE return the file should not be accessed again (other than to close
+// it); this is a fatal error.
+
+int WavpackSeekSample (WavpackContext *wpc, uint32_t sample)
+{
+    return WavpackSeekSample64 (wpc, (int64_t) sample);     // 32-bit entry point: widen the index and delegate
+}
+
+int WavpackSeekSample64 (WavpackContext *wpc, int64_t sample)
+{
+    WavpackStream *wps = wpc->streams ? wpc->streams [wpc->current_stream = 0] : NULL;  // first stream, NULL if no stream array
+    uint32_t bcount, samples_to_skip, samples_to_decode = 0;
+    int32_t *buffer;
+
+    if (wpc->total_samples == -1 || sample < 0 || sample >= wpc->total_samples ||
+        !wpc->reader->can_seek (wpc->wv_in) || (wpc->open_flags & OPEN_STREAMING) ||
+        (wpc->wvc_flag && !wpc->reader->can_seek (wpc->wvc_in)))
+            return FALSE;   // unknown length, index out of range (negative included), streaming, or unseekable input
+
+#ifdef ENABLE_LEGACY
+    if (wpc->stream3)
+        return seek_sample3 (wpc, (uint32_t) sample);      // legacy handler takes over before wps is ever dereferenced
+#endif
+
+#ifdef ENABLE_DSD
+    if (wpc->decimation_context) {      // the decimation code needs some context to be sample accurate
+        if (sample < 16) {
+            samples_to_decode = (uint32_t) sample;
+            sample = 0;
+        }
+        else {
+            samples_to_decode = 16;     // seek 16 samples early, then decode them at the end to prime the decimator
+            sample -= 16;
+        }
+    }
+#endif
+
+    if (!wps->wphdr.block_samples || !(wps->wphdr.flags & INITIAL_BLOCK) || sample < GET_BLOCK_INDEX (wps->wphdr) ||
+        sample >= GET_BLOCK_INDEX (wps->wphdr) + wps->wphdr.block_samples) {
+
+            free_streams (wpc);         // loaded block does not contain the target: drop it and search the file
+            wpc->filepos = find_sample (wpc, wpc->wv_in, wpc->filepos, sample);
+
+            if (wpc->filepos == -1)
+                return FALSE;
+
+            if (wpc->wvc_flag) {
+                wpc->file2pos = find_sample (wpc, wpc->wvc_in, 0, sample);     // correction file searched with no position hint
+
+                if (wpc->file2pos == -1)
+                    return FALSE;
+            }
+    }
+
+    if (!wps->blockbuff) {              // no raw block held for the first stream: read the one at filepos
+        wpc->reader->set_pos_abs (wpc->wv_in, wpc->filepos);
+        wpc->reader->read_bytes (wpc->wv_in, &wps->wphdr, sizeof (WavpackHeader));     // NOTE(review): result not checked
+        WavpackLittleEndianToNative (&wps->wphdr, WavpackHeaderFormat);
+        wps->blockbuff = (unsigned char *)malloc (wps->wphdr.ckSize + 8);              // NOTE(review): NULL not checked before memcpy
+        memcpy (wps->blockbuff, &wps->wphdr, sizeof (WavpackHeader));
+
+        if (wpc->reader->read_bytes (wpc->wv_in, wps->blockbuff + sizeof (WavpackHeader), wps->wphdr.ckSize - 24) !=
+            wps->wphdr.ckSize - 24) {
+                free_streams (wpc);
+                return FALSE;
+        }
+
+        // render corrupt blocks harmless
+        if (!WavpackVerifySingleBlock (wps->blockbuff, !(wpc->open_flags & OPEN_NO_CHECKSUM))) {
+            wps->wphdr.ckSize = sizeof (WavpackHeader) - 8;
+            wps->wphdr.block_samples = 0;
+            memcpy (wps->blockbuff, &wps->wphdr, 32);
+        }
+
+        SET_BLOCK_INDEX (wps->wphdr, GET_BLOCK_INDEX (wps->wphdr) - wpc->initial_index);   // make the index relative to initial_index
+        memcpy (wps->blockbuff, &wps->wphdr, sizeof (WavpackHeader));
+        wps->init_done = FALSE;
+
+        if (wpc->wvc_flag) {            // same procedure for the matching correction-file block
+            wpc->reader->set_pos_abs (wpc->wvc_in, wpc->file2pos);
+            wpc->reader->read_bytes (wpc->wvc_in, &wps->wphdr, sizeof (WavpackHeader));
+            WavpackLittleEndianToNative (&wps->wphdr, WavpackHeaderFormat);
+            wps->block2buff = (unsigned char *)malloc (wps->wphdr.ckSize + 8);         // NOTE(review): NULL not checked before memcpy
+            memcpy (wps->block2buff, &wps->wphdr, sizeof (WavpackHeader));
+
+            if (wpc->reader->read_bytes (wpc->wvc_in, wps->block2buff + sizeof (WavpackHeader), wps->wphdr.ckSize - 24) !=
+                wps->wphdr.ckSize - 24) {
+                    free_streams (wpc);
+                    return FALSE;
+            }
+
+            // render corrupt blocks harmless
+            if (!WavpackVerifySingleBlock (wps->block2buff, !(wpc->open_flags & OPEN_NO_CHECKSUM))) {
+                wps->wphdr.ckSize = sizeof (WavpackHeader) - 8;
+                wps->wphdr.block_samples = 0;
+                memcpy (wps->block2buff, &wps->wphdr, 32);
+            }
+
+            SET_BLOCK_INDEX (wps->wphdr, GET_BLOCK_INDEX (wps->wphdr) - wpc->initial_index);
+            memcpy (wps->block2buff, &wps->wphdr, sizeof (WavpackHeader));
+        }
+
+        if (!wps->init_done && !unpack_init (wpc)) {
+            free_streams (wpc);
+            return FALSE;
+        }
+
+        wps->init_done = TRUE;
+    }
+
+    while (!wpc->reduced_channels && !(wps->wphdr.flags & FINAL_BLOCK)) {   // walk (loading if needed) the remaining streams of the sequence
+        if (++wpc->current_stream == wpc->num_streams) {
+
+            if (wpc->num_streams == wpc->max_streams) {
+                free_streams (wpc);
+                return FALSE;
+            }
+
+            wpc->streams = (WavpackStream **)realloc (wpc->streams, (wpc->num_streams + 1) * sizeof (wpc->streams [0]));
+            wps = wpc->streams [wpc->num_streams++] = (WavpackStream *)malloc (sizeof (WavpackStream));    // NOTE(review): realloc/malloc unchecked
+            CLEAR (*wps);
+            bcount = read_next_header (wpc->reader, wpc->wv_in, &wps->wphdr);
+
+            if (bcount == (uint32_t) -1) {
+                free_streams (wpc);
+                return FALSE;
+            }
+
+            wps->blockbuff = (unsigned char *)malloc (wps->wphdr.ckSize + 8);          // NOTE(review): NULL not checked before memcpy
+            memcpy (wps->blockbuff, &wps->wphdr, 32);
+
+            if (wpc->reader->read_bytes (wpc->wv_in, wps->blockbuff + 32, wps->wphdr.ckSize - 24) !=
+                wps->wphdr.ckSize - 24) {
+                    free_streams (wpc);
+                    return FALSE;
+            }
+
+            // render corrupt blocks harmless
+            if (!WavpackVerifySingleBlock (wps->blockbuff, !(wpc->open_flags & OPEN_NO_CHECKSUM))) {
+                wps->wphdr.ckSize = sizeof (WavpackHeader) - 8;
+                wps->wphdr.block_samples = 0;
+                memcpy (wps->blockbuff, &wps->wphdr, 32);
+            }
+
+            wps->init_done = FALSE;
+
+            if (wpc->wvc_flag && !read_wvc_block (wpc)) {
+                free_streams (wpc);
+                return FALSE;
+            }
+
+            if (!wps->init_done && !unpack_init (wpc)) {
+                free_streams (wpc);
+                return FALSE;
+            }
+
+            wps->init_done = TRUE;
+        }
+        else
+            wps = wpc->streams [wpc->current_stream];
+    }
+
+    if (sample < wps->sample_index) {   // decoder is already past the target: re-initialize every stream
+        for (wpc->current_stream = 0; wpc->current_stream < wpc->num_streams; wpc->current_stream++)
+            if (!unpack_init (wpc))
+                return FALSE;
+            else
+                wpc->streams [wpc->current_stream]->init_done = TRUE;
+    }
+
+    samples_to_skip = (uint32_t) (sample - wps->sample_index);
+
+    if (samples_to_skip > 131072) {     // sanity limit on how far into a block we are willing to decode
+        free_streams (wpc);
+        return FALSE;
+    }
+
+    if (samples_to_skip) {
+        buffer = (int32_t *)malloc (samples_to_skip * 8);      // scratch: 8 bytes (two int32) per skipped sample
+        if (!buffer) { free_streams (wpc); return FALSE; }     // out of memory: fail like the other fatal seek errors
+        for (wpc->current_stream = 0; wpc->current_stream < wpc->num_streams; wpc->current_stream++)
+#ifdef ENABLE_DSD
+            if (wpc->streams [wpc->current_stream]->wphdr.flags & DSD_FLAG)
+                unpack_dsd_samples (wpc, buffer, samples_to_skip);
+            else
+#endif
+                unpack_samples (wpc, buffer, samples_to_skip);
+
+        free (buffer);
+    }
+
+    wpc->current_stream = 0;
+
+#ifdef ENABLE_DSD
+    if (wpc->decimation_context)
+        decimate_dsd_reset (wpc->decimation_context);
+
+    if (samples_to_decode) {            // decode (and discard) the samples held back above
+        buffer = (int32_t *)malloc (samples_to_decode * wpc->config.num_channels * 4);
+
+        if (buffer) {
+            WavpackUnpackSamples (wpc, buffer, samples_to_decode);
+            free (buffer);
+        }
+    }
+#endif
+
+    return TRUE;
+}
+
+// Find a valid WavPack header, searching either from the current file position
+// (or from the specified position if not -1) and store it (endian corrected)
+// at the specified pointer. The return value is the exact file position of the
+// header, although we may have actually read past it. Because this function
+// is used for seeking to a specific audio sample, it only considers blocks
+// that contain audio samples for the initial stream to be valid.
+
+#define BUFSIZE 4096
+
+static int64_t find_header (WavpackStreamReader64 *reader, void *id, int64_t filepos, WavpackHeader *wphdr)
+{
+    unsigned char *buffer = (unsigned char *)malloc (BUFSIZE), *sp = buffer, *ep = buffer;     // sp: scan cursor, ep: end of valid data
+
+    if (!buffer || (filepos != (int64_t) -1 && reader->set_pos_abs (id, filepos))) {   // -1 means "search from the current position"
+        free (buffer);                  // free (NULL) is a no-op, so this also covers the allocation failure
+        return -1;
+    }
+
+    while (1) {
+        int bleft;
+
+        if (sp < ep) {                  // unscanned tail (under 32 bytes) remains: slide it to the front
+            bleft = (int)(ep - sp);
+            memmove (buffer, sp, bleft);        // regions can overlap after a short read, so memcpy is not safe here
+            ep -= (sp - buffer);
+            sp = buffer;
+        }
+        else {
+            if (sp > ep)                // a rejected header told us to skip beyond the buffered data
+                if (reader->set_pos_rel (id, (int32_t)(sp - ep), SEEK_CUR)) {
+                    free (buffer);
+                    return -1;
+                }
+
+            sp = ep = buffer;
+            bleft = 0;
+        }
+
+        ep += reader->read_bytes (id, ep, BUFSIZE - bleft);    // refill behind whatever was kept
+
+        if (ep - sp < 32) {             // fewer than 32 bytes left: no further header can be matched
+            free (buffer);
+            return -1;
+        }
+
+        while (sp + 32 <= ep)           // test for "wvpk" plus plausibility checks on the bytes that follow it
+            if (*sp++ == 'w' && *sp == 'v' && *++sp == 'p' && *++sp == 'k' &&
+                !(*++sp & 1) && sp [2] < 16 && !sp [3] && (sp [2] || sp [1] || *sp >= 24) && sp [5] == 4 &&
+                sp [4] >= (MIN_STREAM_VERS & 0xff) && sp [4] <= (MAX_STREAM_VERS & 0xff) && sp [18] < 3 && !sp [19]) {
+                    memcpy (wphdr, sp - 4, sizeof (*wphdr));    // sp now sits 4 bytes past the start of the candidate header
+                    WavpackLittleEndianToNative (wphdr, WavpackHeaderFormat);
+
+                    if (wphdr->block_samples && (wphdr->flags & INITIAL_BLOCK)) {
+                        free (buffer);
+                        return reader->get_pos (id) - (ep - sp + 4);    // read position minus bytes buffered from the header start
+                    }
+
+                    if (wphdr->ckSize > 1024)   // not a block we want: jump over most of its payload
+                        sp += wphdr->ckSize - 1024;
+            }
+    }
+}
+
+// Find the WavPack block that contains the specified sample. If "header_pos"
+// is zero, then no information is assumed except the total number of samples
+// in the file and its size in bytes. If "header_pos" is non-zero then we
+// assume that it is the file position of the valid header image contained in
+// the first stream and we can limit our search to either the portion above
+// or below that point. If a .wvc file is being used, then this must be called
+// for that file also.
+
+static int64_t find_sample (WavpackContext *wpc, void *infile, int64_t header_pos, int64_t sample)
+{
+    WavpackStream *wps = wpc->streams [wpc->current_stream];
+    int64_t file_pos1 = 0, file_pos2 = wpc->reader->get_length (infile);       // byte range still being searched
+    int64_t sample_pos1 = 0, sample_pos2 = wpc->total_samples;                 // sample range matching that byte range
+    double ratio = 0.96;                // aim slightly short of the interpolated position
+    int file_skip = 0;
+
+    if (sample < 0 || sample >= wpc->total_samples)    // a negative index would collapse the range into a 0/0 interpolation
+        return -1;
+
+    if (header_pos && wps->wphdr.block_samples) {       // use the already-loaded block to halve the search range
+        if (GET_BLOCK_INDEX (wps->wphdr) > sample) {
+            sample_pos2 = GET_BLOCK_INDEX (wps->wphdr);
+            file_pos2 = header_pos;
+        }
+        else if (GET_BLOCK_INDEX (wps->wphdr) + wps->wphdr.block_samples <= sample) {
+            sample_pos1 = GET_BLOCK_INDEX (wps->wphdr);
+            file_pos1 = header_pos;
+        }
+        else
+            return header_pos;          // the loaded block already contains the sample
+    }
+
+    while (1) {
+        double bytes_per_sample;
+        int64_t seek_pos;
+
+        bytes_per_sample = (double) file_pos2 - file_pos1;
+        bytes_per_sample /= sample_pos2 - sample_pos1;
+        seek_pos = file_pos1 + (file_skip ? 32 : 0);    // step past the header at file_pos1 if it was found again
+        seek_pos += (int64_t)(bytes_per_sample * (sample - sample_pos1) * ratio);
+        seek_pos = find_header (wpc->reader, infile, seek_pos, &wps->wphdr);
+
+        if (seek_pos != (int64_t) -1)
+            SET_BLOCK_INDEX (wps->wphdr, GET_BLOCK_INDEX (wps->wphdr) - wpc->initial_index);
+
+        if (seek_pos == (int64_t) -1 || seek_pos >= file_pos2) {
+            if (ratio > 0.0) {          // nothing usable found: retry closer to file_pos1
+                if ((ratio -= 0.24) < 0.0)
+                    ratio = 0.0;
+            }
+            else
+                return -1;              // already searching from file_pos1 itself, so give up
+        }
+        else if (GET_BLOCK_INDEX (wps->wphdr) > sample) {       // landed after the target: tighten the upper bound
+            sample_pos2 = GET_BLOCK_INDEX (wps->wphdr);
+            file_pos2 = seek_pos;
+        }
+        else if (GET_BLOCK_INDEX (wps->wphdr) + wps->wphdr.block_samples <= sample) {   // landed before: raise the lower bound
+
+            if (seek_pos == file_pos1)
+                file_skip = 1;
+            else {
+                sample_pos1 = GET_BLOCK_INDEX (wps->wphdr);
+                file_pos1 = seek_pos;
+            }
+        }
+        else
+            return seek_pos;            // this block contains the sample
+    }
+}
+
+#endif
+
diff --git a/third_party/wavpack/src/unpack_utils.c b/third_party/wavpack/src/unpack_utils.c
new file mode 100644
index 0000000..ce7d7da
--- /dev/null
+++ b/third_party/wavpack/src/unpack_utils.c
@@ -0,0 +1,411 @@
+////////////////////////////////////////////////////////////////////////////
+//                           **** WAVPACK ****                            //
+//                  Hybrid Lossless Wavefile Compressor                   //
+//              Copyright (c) 1998 - 2013 Conifer Software.               //
+//                          All Rights Reserved.                          //
+//      Distributed under the BSD Software License (see license.txt)      //
+////////////////////////////////////////////////////////////////////////////
+
+// unpack_utils.c
+
+// This module provides the high-level API for unpacking audio data from
+// WavPack files. It manages the buffers used to interleave the data passed
+// back to the application from the individual streams. The actual audio
+// stream decompression is handled in the unpack.c module.
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "wavpack_local.h"
+
+///////////////////////////// executable code ////////////////////////////////
+
+// Unpack the specified number of samples from the current file position.
+// Note that "samples" here refers to "complete" samples, which would be
+// 2 longs for stereo files or even more for multichannel files, so the
+// required memory at "buffer" is 4 * samples * num_channels bytes. The
+// audio data is returned right-justified in 32-bit longs in the endian
+// mode native to the executing processor. So, if the original data was
+// 16-bit, then the values returned would be +/-32k. Floating point data
+// can also be returned if the source was floating point data (and this
+// can be optionally normalized to +/-1.0 by using the appropriate flag
+// in the call to WavpackOpenFileInput ()). The actual number of samples
+// unpacked is returned, which should be equal to the number requested unless
+// the end of file is encountered or an error occurs. After all samples have
+// been unpacked then 0 will be returned.
+
+uint32_t WavpackUnpackSamples (WavpackContext *wpc, int32_t *buffer, uint32_t samples)
+{
+    WavpackStream *wps = wpc->streams ? wpc->streams [wpc->current_stream = 0] : NULL;  // first stream, NULL if no stream array
+    int num_channels = wpc->config.num_channels, file_done = FALSE;    // file_done: a later stream's block could not be read
+    uint32_t bcount, samples_unpacked = 0, samples_to_unpack;          // samples_unpacked is the value returned
+    int32_t *bptr = buffer;                                            // write cursor into the caller's buffer
+
+#ifdef ENABLE_LEGACY
+    if (wpc->stream3)
+        return unpack_samples3 (wpc, buffer, samples);     // legacy handler takes over before wps is ever dereferenced
+#endif
+
+    while (samples) {
+
+        // if the current block has no audio, or it's not the first block of a multichannel
+        // sequence, or the sample we're on is past the last sample in this block...we need
+        // to free up the streams and read the next block
+
+        if (!wps->wphdr.block_samples || !(wps->wphdr.flags & INITIAL_BLOCK) ||
+            wps->sample_index >= GET_BLOCK_INDEX (wps->wphdr) + wps->wphdr.block_samples) {
+
+                int64_t nexthdrpos;
+
+                if (wpc->wrapper_bytes >= MAX_WRAPPER_BYTES)
+                    break;
+
+                free_streams (wpc);
+                nexthdrpos = wpc->reader->get_pos (wpc->wv_in);
+                bcount = read_next_header (wpc->reader, wpc->wv_in, &wps->wphdr);
+
+                if (bcount == (uint32_t) -1)
+                    break;
+
+                wpc->filepos = nexthdrpos + bcount;     // presumably bcount is the number of bytes skipped before the header; confirm
+
+                // allocate the memory for the entire raw block and read it in
+
+                wps->blockbuff = (unsigned char *)malloc (wps->wphdr.ckSize + 8);
+
+                if (!wps->blockbuff)
+                    break;
+
+                memcpy (wps->blockbuff, &wps->wphdr, 32);
+
+                if (wpc->reader->read_bytes (wpc->wv_in, wps->blockbuff + 32, wps->wphdr.ckSize - 24) !=
+                    wps->wphdr.ckSize - 24) {
+                        strcpy (wpc->error_message, "can't read all of last block!");
+                        wps->wphdr.block_samples = 0;   // mark the block as header-only so it is not decoded
+                        wps->wphdr.ckSize = 24;
+                        break;
+                }
+
+                // render corrupt blocks harmless
+                if (!WavpackVerifySingleBlock (wps->blockbuff, !(wpc->open_flags & OPEN_NO_CHECKSUM))) {
+                    wps->wphdr.ckSize = sizeof (WavpackHeader) - 8;
+                    wps->wphdr.block_samples = 0;
+                    memcpy (wps->blockbuff, &wps->wphdr, 32);
+                }
+
+                // potentially adjusting block_index must be done AFTER verifying block
+
+                if (wpc->open_flags & OPEN_STREAMING)
+                    SET_BLOCK_INDEX (wps->wphdr, wps->sample_index = 0);
+                else
+                    SET_BLOCK_INDEX (wps->wphdr, GET_BLOCK_INDEX (wps->wphdr) - wpc->initial_index);
+
+                memcpy (wps->blockbuff, &wps->wphdr, 32);
+                wps->init_done = FALSE;     // we have not yet called unpack_init() for this block
+
+                // if this block has audio, but not the sample index we were expecting, flag an error
+
+                if (wps->wphdr.block_samples && wps->sample_index != GET_BLOCK_INDEX (wps->wphdr))
+                    wpc->crc_errors++;
+
+                // if this block has audio, and we're in hybrid lossless mode, read the matching wvc block
+
+                if (wps->wphdr.block_samples && wpc->wvc_flag)
+                    read_wvc_block (wpc);
+
+                // if the block does NOT have any audio, call unpack_init() to process non-audio stuff
+
+                if (!wps->wphdr.block_samples) {
+                    if (!wps->init_done && !unpack_init (wpc))
+                        wpc->crc_errors++;
+
+                    wps->init_done = TRUE;
+                }
+        }
+
+        // if the current block has no audio, or it's not the first block of a multichannel
+        // sequence, or the sample we're on is past the last sample in this block...we need
+        // to loop back and read the next block
+
+        if (!wps->wphdr.block_samples || !(wps->wphdr.flags & INITIAL_BLOCK) ||
+            wps->sample_index >= GET_BLOCK_INDEX (wps->wphdr) + wps->wphdr.block_samples)
+                continue;
+
+        // There seems to be some missing data, like a block was corrupted or something.
+        // If it's not too much data, just fill in with silence here and loop back.
+
+        if (wps->sample_index < GET_BLOCK_INDEX (wps->wphdr)) {
+            int32_t zvalue = (wps->wphdr.flags & DSD_FLAG) ? 0x55 : 0;     // fill value: 0x55 for DSD blocks, 0 otherwise
+
+            samples_to_unpack = (uint32_t) (GET_BLOCK_INDEX (wps->wphdr) - wps->sample_index);
+
+            if (!samples_to_unpack || samples_to_unpack > 262144) {        // gap too large (or truncated to 0 by the cast)
+                strcpy (wpc->error_message, "discontinuity found, aborting file!");
+                wps->wphdr.block_samples = 0;
+                wps->wphdr.ckSize = 24;
+                break;
+            }
+
+            if (samples_to_unpack > samples)
+                samples_to_unpack = samples;
+
+            wps->sample_index += samples_to_unpack;
+            samples_unpacked += samples_to_unpack;
+            samples -= samples_to_unpack;
+
+            samples_to_unpack *= (wpc->reduced_channels ? wpc->reduced_channels : num_channels);   // now a count of individual values
+
+            while (samples_to_unpack--)
+                *bptr++ = zvalue;
+
+            continue;
+        }
+
+        // calculate number of samples to process from this block, then initialize the decoder for
+        // this block if we haven't already
+
+        samples_to_unpack = (uint32_t) (GET_BLOCK_INDEX (wps->wphdr) + wps->wphdr.block_samples - wps->sample_index);
+
+        if (samples_to_unpack > samples)
+            samples_to_unpack = samples;
+
+        if (!wps->init_done && !unpack_init (wpc))
+            wpc->crc_errors++;
+
+        wps->init_done = TRUE;
+
+        // if this block is not the final block of a multichannel sequence (and we're not truncating
+        // to stereo), then enter this conditional block...otherwise we just unpack the samples directly
+
+        if (!wpc->reduced_channels && !(wps->wphdr.flags & FINAL_BLOCK)) {
+            int32_t *temp_buffer = (int32_t *)malloc (samples_to_unpack * 8), *src, *dst;  // 8 bytes (two int32) per sample
+            int offset = 0;     // offset to next channel in sequence (0 to num_channels - 1)
+            uint32_t samcnt;
+
+            // since we are getting samples from multiple blocks in a multichannel sequence, we must
+            // allocate a temporary buffer to unpack to so that we can re-interleave the samples
+
+	    if (!temp_buffer)
+		break;
+
+            // loop through all the streams...
+
+            while (1) {
+
+                // if the stream has not been allocated and corresponding block read, do that here...
+
+                if (wpc->current_stream == wpc->num_streams) {
+                    wpc->streams = (WavpackStream **)realloc (wpc->streams, (wpc->num_streams + 1) * sizeof (wpc->streams [0]));
+
+                    if (!wpc->streams)  // NOTE(review): wpc->streams is NULL here yet is indexed after this loop; looks like a crash on OOM, confirm
+			break;
+
+                    wps = wpc->streams [wpc->num_streams++] = (WavpackStream *)malloc (sizeof (WavpackStream));
+
+                    if (!wps)           // NOTE(review): wps is NULL here yet wps->wphdr.flags is read after this loop; confirm
+			break;
+
+                    CLEAR (*wps);
+                    bcount = read_next_header (wpc->reader, wpc->wv_in, &wps->wphdr);
+
+                    if (bcount == (uint32_t) -1) {
+                        wpc->streams [0]->wphdr.block_samples = 0;     // invalidate the first stream's block so the next call reads a new one
+                        wpc->streams [0]->wphdr.ckSize = 24;
+                        file_done = TRUE;
+                        break;
+                    }
+
+                    wps->blockbuff = (unsigned char *)malloc (wps->wphdr.ckSize + 8);
+
+                    if (!wps->blockbuff)        // NOTE(review): stream stays in the table with no block and file_done unset; confirm intent
+		        break;
+
+                    memcpy (wps->blockbuff, &wps->wphdr, 32);
+
+                    if (wpc->reader->read_bytes (wpc->wv_in, wps->blockbuff + 32, wps->wphdr.ckSize - 24) !=
+                        wps->wphdr.ckSize - 24) {
+                            wpc->streams [0]->wphdr.block_samples = 0;
+                            wpc->streams [0]->wphdr.ckSize = 24;
+                            file_done = TRUE;
+                            break;
+                    }
+
+                    // render corrupt blocks harmless
+                    if (!WavpackVerifySingleBlock (wps->blockbuff, !(wpc->open_flags & OPEN_NO_CHECKSUM))) {
+                        wps->wphdr.ckSize = sizeof (WavpackHeader) - 8;
+                        wps->wphdr.block_samples = 0;
+                        memcpy (wps->blockbuff, &wps->wphdr, 32);
+                    }
+
+                    // potentially adjusting block_index must be done AFTER verifying block
+
+                    if (wpc->open_flags & OPEN_STREAMING)
+                        SET_BLOCK_INDEX (wps->wphdr, wps->sample_index = 0);
+                    else
+                        SET_BLOCK_INDEX (wps->wphdr, GET_BLOCK_INDEX (wps->wphdr) - wpc->initial_index);
+
+                    memcpy (wps->blockbuff, &wps->wphdr, 32);
+
+                    // if this block has audio, and we're in hybrid lossless mode, read the matching wvc block
+
+                    if (wpc->wvc_flag)
+                        read_wvc_block (wpc);
+
+                    // initialize the unpacker for this block
+
+                    if (!unpack_init (wpc))
+                        wpc->crc_errors++;
+
+                    wps->init_done = TRUE;
+                }
+                else
+                    wps = wpc->streams [wpc->current_stream];
+
+                // unpack the correct number of samples (either mono or stereo) into the temp buffer
+
+#ifdef ENABLE_DSD
+                if (wps->wphdr.flags & DSD_FLAG)
+                    unpack_dsd_samples (wpc, src = temp_buffer, samples_to_unpack);
+                else
+#endif
+                    unpack_samples (wpc, src = temp_buffer, samples_to_unpack);
+
+                samcnt = samples_to_unpack;
+                dst = bptr + offset;    // first output slot for this stream's channel(s)
+
+                // if the block is mono, copy the samples from the single channel into the destination
+                // using num_channels as the stride
+
+                if (wps->wphdr.flags & MONO_FLAG) {
+                    while (samcnt--) {
+                        dst [0] = *src++;
+                        dst += num_channels;
+                    }
+
+                    offset++;
+                }
+
+                // if the block is stereo, and we don't have room for two more channels, just copy one
+                // and flag an error
+
+                else if (offset == num_channels - 1) {
+                    while (samcnt--) {
+                        dst [0] = src [0];
+                        dst += num_channels;
+                        src += 2;       // drop the second channel of each pair
+                    }
+
+                    wpc->crc_errors++;
+                    offset++;
+                }
+
+                // otherwise copy the stereo samples into the destination
+
+                else {
+                    while (samcnt--) {
+                        dst [0] = *src++;
+                        dst [1] = *src++;
+                        dst += num_channels;
+                    }
+
+                    offset += 2;
+                }
+
+                // check several clues that we're done with this set of blocks and exit if we are; else do next stream
+
+                if ((wps->wphdr.flags & FINAL_BLOCK) || wpc->current_stream == wpc->max_streams - 1 || offset == num_channels)
+                    break;
+                else
+                    wpc->current_stream++;
+            }
+
+            // if we didn't get all the channels we expected, mute the buffer and flag an error
+
+            if (offset != num_channels) {
+                if (wps->wphdr.flags & DSD_FLAG) {
+                    int samples_to_zero = samples_to_unpack * num_channels;
+                    int32_t *zptr = bptr;
+
+                    while (samples_to_zero--)
+                        *zptr++ = 0x55;
+                }
+                else
+                    memset (bptr, 0, samples_to_unpack * num_channels * 4);
+
+                wpc->crc_errors++;
+            }
+
+            // go back to the first stream (we're going to leave them all loaded for now because they might have more samples)
+            // and free the temp buffer
+
+            wps = wpc->streams [wpc->current_stream = 0];
+            free (temp_buffer);
+        }
+        // catch the error situation where we have only one channel but run into a stereo block
+        // (this avoids overwriting the caller's buffer)
+        else if (!(wps->wphdr.flags & MONO_FLAG) && (num_channels == 1 || wpc->reduced_channels == 1)) {
+            memset (bptr, 0, samples_to_unpack * sizeof (*bptr));
+            wps->sample_index += samples_to_unpack;
+            wpc->crc_errors++;
+        }
+#ifdef ENABLE_DSD
+        else if (wps->wphdr.flags & DSD_FLAG)
+            unpack_dsd_samples (wpc, bptr, samples_to_unpack);
+#endif
+        else
+            unpack_samples (wpc, bptr, samples_to_unpack);
+
+        if (file_done) {        // a later stream's block was unreadable: stop without counting this pass
+            strcpy (wpc->error_message, "can't read all of last block!");
+            break;
+        }
+
+        if (wpc->reduced_channels)
+            bptr += samples_to_unpack * wpc->reduced_channels;
+        else
+            bptr += samples_to_unpack * num_channels;
+
+        samples_unpacked += samples_to_unpack;
+        samples -= samples_to_unpack;
+
+        // if we just finished a block, check for a calculated crc error
+        // (and back up the streams a little if possible in case we passed a header)
+
+        if (wps->sample_index == GET_BLOCK_INDEX (wps->wphdr) + wps->wphdr.block_samples) {
+            if (check_crc_error (wpc)) {
+                int32_t *zptr = bptr, zvalue = (wps->wphdr.flags & DSD_FLAG) ? 0x55 : 0;
+                uint32_t samples_to_zero = wps->wphdr.block_samples;
+
+                if (samples_to_zero > samples_to_unpack)        // only what this pass wrote is overwritten
+                    samples_to_zero = samples_to_unpack;
+
+                samples_to_zero *= (wpc->reduced_channels ? wpc->reduced_channels : num_channels);
+
+                while (samples_to_zero--)
+                    *--zptr = zvalue;   // walk backwards from the write cursor
+
+                if (wps->blockbuff && wpc->reader->can_seek (wpc->wv_in)) {
+                    int32_t rseek = ((WavpackHeader *) wps->blockbuff)->ckSize / 3;    // back up a third of the block, at most 16384 bytes
+                    wpc->reader->set_pos_rel (wpc->wv_in, (rseek > 16384) ? -16384 : -rseek, SEEK_CUR);
+                }
+
+                if (wpc->wvc_flag && wps->block2buff && wpc->reader->can_seek (wpc->wvc_in)) {
+                    int32_t rseek = ((WavpackHeader *) wps->block2buff)->ckSize / 3;
+                    wpc->reader->set_pos_rel (wpc->wvc_in, (rseek > 16384) ? -16384 : -rseek, SEEK_CUR);
+                }
+
+                wpc->crc_errors++;
+            }
+        }
+
+        if (wpc->total_samples != -1 && wps->sample_index == wpc->total_samples)       // reached the known end of the audio
+            break;
+    }
+
+#ifdef ENABLE_DSD
+    if (wpc->decimation_context)
+        decimate_dsd_run (wpc->decimation_context, buffer, samples_unpacked);
+#endif
+
+    return samples_unpacked;
+}
diff --git a/third_party/wavpack/src/unpack_x64.S b/third_party/wavpack/src/unpack_x64.S
new file mode 100644
index 0000000..f9657cf
--- /dev/null
+++ b/third_party/wavpack/src/unpack_x64.S
@@ -0,0 +1,957 @@
+############################################################################
+##                           **** WAVPACK ****                            ##
+##                  Hybrid Lossless Wavefile Compressor                   ##
+##              Copyright (c) 1998 - 2015 Conifer Software.               ##
+##                          All Rights Reserved.                          ##
+##      Distributed under the BSD Software License (see license.txt)      ##
+############################################################################
+
+        .intel_syntax noprefix
+        .text
+
+        .globl  _unpack_decorr_stereo_pass_cont_x64win
+        .globl  _unpack_decorr_mono_pass_cont_x64win
+
+        .globl  unpack_decorr_stereo_pass_cont_x64win
+        .globl  unpack_decorr_mono_pass_cont_x64win
+
+        .globl  _unpack_decorr_stereo_pass_cont_x64
+        .globl  _unpack_decorr_mono_pass_cont_x64
+
+        .globl  unpack_decorr_stereo_pass_cont_x64
+        .globl  unpack_decorr_mono_pass_cont_x64
+
+# This is an assembly optimized version of the following WavPack function:
+#
+# void unpack_decorr_stereo_pass_cont (struct decorr_pass *dpp,
+#                                      int32_t *buffer,
+#                                      int32_t sample_count,
+#                                      int32_t long_math);
+#
+# It performs a single pass of stereo decorrelation on the provided buffer.
+# Note that this version of the function requires that up to 8 previous
+# stereo samples are visible and correct. In other words, it ignores the
+# "samples_*" fields in the decorr_pass structure and gets the history data
+# directly from the buffer. It does, however, return the appropriate history
+# samples to the decorr_pass structure before returning.
+#
+# The "long_math" argument is used to specify that a 32-bit multiply is
+# not enough for the "apply_weight" operation (although in this case it
+# would only apply to the -1 and -2 terms because the MMX code does not have
+# this limitation) but we ignore the parameter and use the overflow detection
+# of the "imul" instruction to switch automatically to the "long_math" loop.
+#
+# This is written to work on an X86-64 processor (also called the AMD64)
+# running in 64-bit mode and generally uses the MMX extensions to improve
+# the performance by processing both stereo channels together. Unfortunately
+# this is not easily used for terms -1 and -2, so these terms are handled
+# sequentially with regular assembler code.
+#
+# This version has entry points for both the System V ABI and the Windows
+# X64 ABI. It does not use the "red zone" or the "shadow area"; it saves the
+# non-volatile registers for both ABIs on the stack and allocates another
+# 8 bytes on the stack to store the dpp pointer. Note that it does NOT
+# provide unwind data for the Windows ABI (the unpack_x64.asm module for
+# MSVC does). The arguments are passed in registers:
+#
+# System V  Windows  
+#   rdi       rcx      struct decorr_pass *dpp
+#   rsi       rdx      int32_t *buffer
+#   edx       r8       int32_t sample_count
+#   ecx       r9       int32_t long_math
+#
+# registers after entry:
+#
+#   rdi         bptr
+#   rsi         eptr
+#
+# stack usage:
+#
+# [rsp+0] = *dpp
+#
+
+_unpack_decorr_stereo_pass_cont_x64win:
+unpack_decorr_stereo_pass_cont_x64win:
+        push    rbp                         # save registers (popped again at "done")
+        push    rbx
+        push    rdi
+        push    rsi
+        sub     rsp, 8                      # reserve 8 bytes to hold dpp*
+        mov     rdi, rcx                    # copy params from win regs to Linux regs
+        mov     rsi, rdx                    # so we can leave following code similar
+        mov     rdx, r8                     # (rdx = sample_count)
+        mov     rcx, r9                     # (rcx = long_math)
+        jmp     entry                       # jump into common portion
+
+_unpack_decorr_stereo_pass_cont_x64:
+unpack_decorr_stereo_pass_cont_x64:
+        push    rbp                         # same prologue as the Windows entry point
+        push    rbx
+        push    rdi
+        push    rsi
+        sub     rsp, 8                      # reserve 8 bytes to hold dpp*
+
+entry:  mov     [rsp], rdi                  # store dpp* at [rsp]
+        and     edx, edx                    # if sample_count is zero, do nothing
+        jz      done
+
+        mov     rdi, rsi                    # rdi = bptr
+        lea     rsi, [rdi+rdx*8]            # rsi = eptr (8 bytes per stereo sample pair)
+
+        mov     rax, [rsp]                  # get term from dpp struct & vector to handler
+        mov     eax, [rax]                  # eax = dpp->term
+        cmp     al, 17                      # only the low byte of term is compared
+        je      term_17_entry
+        cmp     al, 18
+        je      term_18_entry
+        cmp     al, -1
+        je      term_minus_1_entry
+        cmp     al, -2
+        je      term_minus_2_entry
+        cmp     al, -3
+        je      term_minus_3_entry          # no match: fall through to default term code
+
+#
+# registers in default term loop:
+#
+#   rbx         term * -8 (for indexing correlation sample)
+#   rdi         bptr
+#   rsi         eptr
+#
+#   mm0, mm1    scratch
+#   mm2         original sample values
+#   mm3         correlation sample
+#   mm4         zero (for pcmpeqd)
+#   mm5         weights
+#   mm6         delta
+#   mm7         512 (for rounding)
+#
+
+default_term_entry:
+        imul    rbx, rax, -8                # set RBX to term * -8
+        mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  # mm7 = round (512)
+        mov     rdx, [rsp]                  # set RDX to *dpp
+        mov     eax, [rdx+4]                # eax = dpp->delta
+        movd    mm6, eax
+        punpckldq mm6, mm6                  # mm6 = delta (0-7)
+        mov     eax, 0xFFFF                 # mask high weights to zero for PMADDWD
+        movd    mm5, eax
+        punpckldq mm5, mm5                  # mm5 = weight mask 0x0000FFFF0000FFFF
+        pand    mm5, [rdx+8]                # mm5 = weight_AB masked to 16 bits
+        pxor    mm4, mm4                    # mm4 = zero (for pcmpeqd)
+        jmp     default_term_loop           # jump over the alignment padding
+
+        .balign  64
+default_term_loop:
+        movq    mm3, [rdi+rbx]              # mm3 = sam_AB
+        movq    mm1, mm3                    # mm1 = copy used for the low 15 bits
+        movq    mm0, mm3                    # mm0 = copy used for the upper bits
+        paddd   mm1, mm1                    # shift left 1 so psrlw leaves bits 0-14
+        psrld   mm0, 15                     # mm0 = sam_AB >> 15
+        psrlw   mm1, 1                      # low words of mm1 = sam_AB & 0x7FFF
+        pmaddwd mm0, mm5                    # upper part * weight
+        pmaddwd mm1, mm5                    # lower part * weight
+        movq    mm2, [rdi]                  # mm2 = left_right
+        pslld   mm0, 5                      # upper product << 5 (i.e. << 15 >> 10)
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10                     # lower product >> 10
+        paddd   mm0, mm2                    # add original left_right
+        paddd   mm0, mm1                    # add shifted sums
+        movq    [rdi], mm0                  # store result
+        movq    mm0, mm3
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        add     rdi, 8                      # advance bptr to next stereo pair
+        pcmpeqd mm2, mm4                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm4                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pxor    mm5, mm0                    # complement weights where sign is negative
+        paddw   mm5, mm2                    # and add to weight_AB
+        pxor    mm5, mm0                    # complement back (net: weight -= delta there)
+        cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
+        jb      default_term_loop
+
+        pslld   mm5, 16                     # sign-extend 16-bit weights back to dwords
+        psrad   mm5, 16
+        mov     rdx, [rsp]                  # point to dpp
+        movq    [rdx+8], mm5                # put weight_AB back
+        emms
+
+        mov     ecx, [rdx]                  # ecx = dpp->term
+
+default_store_samples:
+        dec     ecx                         # ecx = history index, term-1 down to 0
+        sub     rdi, 8                      # back up one full sample
+        mov     eax, [rdi+4]
+        mov     [rdx+rcx*4+48], eax         # store samples_B [ecx]
+        mov     eax, [rdi]
+        mov     [rdx+rcx*4+16], eax         # store samples_A [ecx]
+        test    ecx, ecx
+        jnz     default_store_samples       # loop until index 0 has been stored
+        jmp     done
+
+#
+# registers in term 17 & 18 loops:
+#
+#   rdi         bptr
+#   rsi         eptr
+#
+#   mm0, mm1    scratch
+#   mm2         original sample values
+#   mm3         correlation samples
+#   mm4         last calculated values (so we don't need to reload)
+#   mm5         weights
+#   mm6         delta
+#   mm7         512 (for rounding)
+#
+
+term_17_entry:
+        mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  # mm7 = round (512)
+        mov     rdx, [rsp]                  # set RDX to *dpp
+        mov     eax, [rdx+4]                # eax = dpp->delta
+        movd    mm6, eax
+        punpckldq mm6, mm6                  # mm6 = delta (0-7)
+        mov     eax, 0xFFFF                 # mask high weights to zero for PMADDWD
+        movd    mm5, eax
+        punpckldq mm5, mm5                  # mm5 = weight mask 0x0000FFFF0000FFFF
+        pand    mm5, [rdx+8]                # mm5 = weight_AB masked to 16 bits
+        movq    mm4, [rdi-8]                # preload last calculated values in mm4
+        jmp     term_17_loop                # jump over the alignment padding
+
+        .balign  64
+term_17_loop:
+        paddd   mm4, mm4                    # mm4 = 2 * last calculated pair
+        psubd   mm4, [rdi-16]               # mm4 = 2 * last pair - pair before it
+        movq    mm3, mm4                    # mm3 = sam_AB
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrld   mm4, 15                     # mm4 = sam_AB >> 15 (upper part)
+        psrlw   mm1, 1                      # low words of mm1 = sam_AB & 0x7FFF
+        pmaddwd mm4, mm5                    # upper part * weight
+        pmaddwd mm1, mm5                    # lower part * weight
+        movq    mm2, [rdi]                  # mm2 = left_right
+        pslld   mm4, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        paddd   mm4, mm2                    # add original left_right
+        paddd   mm4, mm1                    # add shifted sums
+        movq    mm0, mm3
+        movq    [rdi], mm4                  # store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        add     rdi, 8                      # advance bptr to next stereo pair
+        pxor    mm1, mm1                    # mm1 = zero
+        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pxor    mm5, mm0                    # complement weights where sign is negative
+        paddw   mm5, mm2                    # and add to weight_AB
+        pxor    mm5, mm0                    # complement back (net: weight -= delta there)
+        cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
+        jb      term_17_loop
+        jmp     term_1718_exit              # terms 17 & 18 treat samples_AB[] the same
+
+term_18_entry:
+        mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  # mm7 = round (512)
+        mov     rdx, [rsp]                  # set RDX to *dpp
+        mov     eax, [rdx+4]                # eax = dpp->delta
+        movd    mm6, eax
+        punpckldq mm6, mm6                  # mm6 = delta (0-7)
+        mov     eax, 0xFFFF                 # mask high weights to zero for PMADDWD
+        movd    mm5, eax
+        punpckldq mm5, mm5                  # mm5 = weight mask 0x0000FFFF0000FFFF
+        pand    mm5, [rdx+8]                # mm5 = weight_AB masked to 16 bits
+        movq    mm4, [rdi-8]                # preload last calculated values in mm4
+        jmp     term_18_loop                # jump over the alignment padding
+
+        .balign  64
+term_18_loop:
+        movq    mm3, mm4
+        psubd   mm3, [rdi-16]               # mm3 = last pair - pair before it
+        psrad   mm3, 1                      # halve the difference (arithmetic shift)
+        paddd   mm3, mm4                    # mm3 = sam_AB
+        movq    mm1, mm3
+        movq    mm4, mm3
+        paddd   mm1, mm1
+        psrld   mm4, 15                     # mm4 = sam_AB >> 15 (upper part)
+        psrlw   mm1, 1                      # low words of mm1 = sam_AB & 0x7FFF
+        pmaddwd mm4, mm5                    # upper part * weight
+        pmaddwd mm1, mm5                    # lower part * weight
+        movq    mm2, [rdi]                  # mm2 = left_right
+        pslld   mm4, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        paddd   mm4, mm2                    # add original left_right
+        paddd   mm4, mm1                    # add shifted sums
+        movq    mm0, mm3
+        movq    [rdi], mm4                  # store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        add     rdi, 8                      # advance bptr to next stereo pair
+        pxor    mm1, mm1                    # mm1 = zero
+        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pxor    mm5, mm0                    # complement weights where sign is negative
+        paddw   mm5, mm2                    # and add to weight_AB
+        pxor    mm5, mm0                    # complement back (net: weight -= delta there)
+        cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
+        jb      term_18_loop
+
+term_1718_exit:
+        pslld   mm5, 16                     # sign-extend 16-bit weights back to dwords
+        psrad   mm5, 16
+        mov     rdx, [rsp]                  # point to dpp
+        movq    [rdx+8], mm5                # put weight_AB back
+        emms
+
+        mov     eax, [rdi-4]                # dpp->samples_B [0] = bptr [-1];
+        mov     [rdx+48], eax
+        mov     eax, [rdi-8]                # dpp->samples_A [0] = bptr [-2];
+        mov     [rdx+16], eax
+        mov     eax, [rdi-12]               # dpp->samples_B [1] = bptr [-3];
+        mov     [rdx+52], eax
+        mov     eax, [rdi-16]               # dpp->samples_A [1] = bptr [-4];
+        mov     [rdx+20], eax
+        jmp     done
+
+#
+# registers in term -1 & -2 loops:
+#
+#   eax,ebx,edx scratch
+#   ecx         weight_A
+#   ebp         weight_B
+#   rdi         bptr
+#   rsi         eptr
+#   r8d         delta
+#
+
+term_minus_1_entry:
+        cld                                 # stosd must increment rdi
+        mov     rdx, [rsp]                  # point to dpp
+        mov     ecx, [rdx+8]                # ecx = weight_A
+        mov     ebp, [rdx+12]               # ebp = weight_B
+        mov     r8d, [rdx+4]                # r8d = delta
+        mov     eax, [rdi-4]                # eax = bptr [-1] (first correlation sample)
+        jmp     term_minus_1_loop
+
+        .balign  64
+term_minus_1_loop:
+        mov     ebx, eax                    # ebx = correlation sample (for weight update)
+        imul    eax, ecx                    # eax = sample * weight_A (32-bit product)
+        mov     edx, [rdi]                  # edx = original bptr [0]
+        jo      OV11                        # product overflowed: use long math loop
+        sar     eax, 10                     # CF = last bit shifted out (for rounding)
+        adc     eax, edx                    # eax = (product >> 10) + CF + original
+        stosd                               # store result and advance bptr
+        test    ebx, ebx
+        je      L182                        # no weight update if sample is zero
+        test    edx, edx
+        je      L182                        # ...or if original value is zero
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = 0 or -1: sign (sample ^ original)
+        xor     ecx, ebx                    # complement weight_A if sign is negative
+        add     ecx, r8d                    # add delta
+        mov     edx, 1024
+        add     edx, ebx                    # edx = limit: 1024, or 1023 if negative
+        cmp     ecx, edx
+        jle     L183
+        mov     ecx, edx                    # clamp to limit
+L183:   xor     ecx, ebx                    # complement back
+L182:   mov     ebx, eax                    # value just stored is the next sample
+        imul    eax, ebp                    # eax = sample * weight_B (32-bit product)
+        mov     edx, [rdi]                  # edx = original value at bptr
+        jo      OV12                        # product overflowed: use long math loop
+        sar     eax, 10
+        adc     eax, edx                    # eax = (product >> 10) + CF + original
+        stosd                               # store result and advance bptr
+        test    ebx, ebx
+        je      L187
+        test    edx, edx
+        je      L187
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = 0 or -1: sign (sample ^ original)
+        xor     ebp, ebx                    # same clamped update as above, for weight_B
+        add     ebp, r8d
+        mov     edx, 1024
+        add     edx, ebx
+        cmp     ebp, edx
+        jle     L188
+        mov     ebp, edx
+L188:   xor     ebp, ebx
+L187:   cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
+        jb      term_minus_1_loop
+        jmp     term_minus_1_done
+
+OV11:   mov     eax, ebx                    # restore previous sample into eax
+        jmp     long_term_minus_1_loop
+
+OV12:   mov     eax, ebx                    # restore previous sample into eax
+        jmp     L282
+
+        .balign  64
+long_term_minus_1_loop:
+        mov     ebx, eax                    # ebx = correlation sample (for weight update)
+        imul    ecx                         # edx:eax = sample * weight_A (64-bit product)
+        shl     edx, 22
+        shr     eax, 10                     # CF = last bit shifted out (for rounding)
+        adc     eax, edx                    # eax = product >> 10 (low 32 bits), rounded
+        mov     edx, [rdi]                  # edx = original bptr [0]
+        add     eax, edx                    # add original value
+        stosd                               # store result and advance bptr
+        test    ebx, ebx
+        je      L282
+        test    edx, edx
+        je      L282
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = 0 or -1: sign (sample ^ original)
+        xor     ecx, ebx                    # clamped weight_A update, as in 32-bit loop
+        add     ecx, r8d
+        mov     edx, 1024
+        add     edx, ebx
+        cmp     ecx, edx
+        jle     L283
+        mov     ecx, edx
+L283:   xor     ecx, ebx
+L282:   mov     ebx, eax                    # value just stored is the next sample
+        imul    ebp                         # edx:eax = sample * weight_B (64-bit product)
+        shl     edx, 22
+        shr     eax, 10
+        adc     eax, edx                    # eax = product >> 10 (low 32 bits), rounded
+        mov     edx, [rdi]                  # edx = original value at bptr
+        add     eax, edx                    # add original value
+        stosd                               # store result and advance bptr
+        test    ebx, ebx
+        je      L287
+        test    edx, edx
+        je      L287
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = 0 or -1: sign (sample ^ original)
+        xor     ebp, ebx                    # clamped weight_B update
+        add     ebp, r8d
+        mov     edx, 1024
+        add     edx, ebx
+        cmp     ebp, edx
+        jle     L288
+        mov     ebp, edx
+L288:   xor     ebp, ebx
+L287:   cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
+        jb      long_term_minus_1_loop
+
+term_minus_1_done:
+        mov     rdx, [rsp]                  # point to dpp
+        mov     [rdx+8], ecx                # store weights back
+        mov     [rdx+12], ebp
+        mov     eax, [rdi-4]                # dpp->samples_A [0] = bptr [-1];
+        mov     [rdx+16], eax
+        jmp     done
+
+term_minus_2_entry:
+        mov     rdx, [rsp]                  # point to dpp
+        mov     ecx, [rdx+8]                # ecx = weight_A
+        mov     ebp, [rdx+12]               # ebp = weight_B
+        mov     r8d, [rdx+4]                # r8d = delta
+        mov     eax, [rdi-8]                # eax = bptr [-2] (first correlation sample)
+        jmp     term_minus_2_loop
+
+        .balign  64
+term_minus_2_loop:
+        mov     ebx, eax                    # ebx = correlation sample (for weight update)
+        imul    eax, ebp                    # eax = sample * weight_B (32-bit product)
+        mov     edx, [rdi+4]                # edx = original bptr [1]
+        jo      OV21                        # product overflowed: use long math loop
+        sar     eax, 10                     # CF = last bit shifted out (for rounding)
+        adc     eax, edx                    # eax = (product >> 10) + CF + original
+        mov     [rdi+4], eax                # store result in bptr [1]
+        test    ebx, ebx
+        je      L194                        # no weight update if sample is zero
+        test    edx, edx
+        je      L194                        # ...or if original value is zero
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = 0 or -1: sign (sample ^ original)
+        xor     ebp, ebx                    # complement weight_B if sign is negative
+        add     ebp, r8d                    # add delta
+        mov     edx, 1024
+        add     edx, ebx                    # edx = limit: 1024, or 1023 if negative
+        cmp     ebp, edx
+        jle     L195
+        mov     ebp, edx                    # clamp to limit
+L195:   xor     ebp, ebx                    # complement back
+L194:   mov     ebx, eax                    # new bptr [1] is the next sample
+        imul    eax, ecx                    # eax = sample * weight_A (32-bit product)
+        mov     edx, [rdi]                  # edx = original bptr [0]
+        jo      OV22                        # product overflowed: use long math loop
+        sar     eax, 10
+        adc     eax, edx                    # eax = (product >> 10) + CF + original
+        mov     [rdi], eax                  # store result in bptr [0]
+        test    ebx, ebx
+        je      L199
+        test    edx, edx
+        je      L199
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = 0 or -1: sign (sample ^ original)
+        xor     ecx, ebx                    # same clamped update as above, for weight_A
+        add     ecx, r8d
+        mov     edx, 1024
+        add     edx, ebx
+        cmp     ecx, edx
+        jle     L200
+        mov     ecx, edx
+L200:   xor     ecx, ebx
+L199:   add     rdi, 8                      # advance bptr to next stereo pair
+        cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
+        jb      term_minus_2_loop
+        jmp     term_minus_2_done
+
+OV21:   mov     eax, ebx                    # restore previous sample into eax
+        jmp     long_term_minus_2_loop
+
+OV22:   mov     eax, ebx                    # restore previous sample into eax
+        jmp     L294
+
+        .balign  64
+long_term_minus_2_loop:
+        mov     ebx, eax                    # ebx = correlation sample (for weight update)
+        imul    ebp                         # edx:eax = sample * weight_B (64-bit product)
+        shl     edx, 22
+        shr     eax, 10                     # CF = last bit shifted out (for rounding)
+        adc     eax, edx                    # eax = product >> 10 (low 32 bits), rounded
+        mov     edx, [rdi+4]                # edx = original bptr [1]
+        add     eax, edx                    # add original value
+        mov     [rdi+4], eax                # store result in bptr [1]
+        test    ebx, ebx
+        je      L294
+        test    edx, edx
+        je      L294
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = 0 or -1: sign (sample ^ original)
+        xor     ebp, ebx                    # clamped weight_B update, as in 32-bit loop
+        add     ebp, r8d
+        mov     edx, 1024
+        add     edx, ebx
+        cmp     ebp, edx
+        jle     L295
+        mov     ebp, edx
+L295:   xor     ebp, ebx
+L294:   mov     ebx, eax                    # new bptr [1] is the next sample
+        imul    ecx                         # edx:eax = sample * weight_A (64-bit product)
+        shl     edx, 22
+        shr     eax, 10
+        adc     eax, edx                    # eax = product >> 10 (low 32 bits), rounded
+        mov     edx, [rdi]                  # edx = original bptr [0]
+        add     eax, edx                    # add original value
+        mov     [rdi], eax                  # store result in bptr [0]
+        test    ebx, ebx
+        je      L299
+        test    edx, edx
+        je      L299
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = 0 or -1: sign (sample ^ original)
+        xor     ecx, ebx                    # clamped weight_A update
+        add     ecx, r8d
+        mov     edx, 1024
+        add     edx, ebx
+        cmp     ecx, edx
+        jle     L300
+        mov     ecx, edx
+L300:   xor     ecx, ebx
+L299:   add     rdi, 8                      # advance bptr to next stereo pair
+        cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
+        jb      long_term_minus_2_loop
+
+term_minus_2_done:
+        mov     rdx, [rsp]                  # point to dpp
+        mov     [rdx+8], ecx                # store weights back
+        mov     [rdx+12], ebp
+        mov     eax, [rdi-8]                # dpp->samples_B [0] = bptr [-2];
+        mov     [rdx+48], eax
+        jmp     done
+
+#
+# registers in term -3 loop:
+#
+#   rdi         bptr
+#   rsi         eptr
+#
+#   mm0, mm1    scratch
+#   mm2         original sample values
+#   mm3         correlation samples
+#   mm4         last calculated values (so we don't need to reload)
+#   mm5         weights
+#   mm6         delta
+#   mm7         512 (for rounding)
+#
+
+term_minus_3_entry:
+        mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  # mm7 = round (512)
+        mov     rdx, [rsp]                  # set RDX to *dpp
+        mov     eax, [rdx+4]                # eax = dpp->delta
+        movd    mm6, eax
+        punpckldq mm6, mm6                  # mm6 = delta (0-7)
+        mov     eax, 0xFFFF                 # mask high weights to zero for PMADDWD
+        movd    mm5, eax
+        punpckldq mm5, mm5                  # mm5 = weight mask 0x0000FFFF0000FFFF
+        pand    mm5, [rdx+8]                # mm5 = weight_AB masked to 16 bits
+        movq    mm4, [rdi-8]                # preload last calculated values in mm4
+        jmp     term_minus_3_loop           # jump over the alignment padding
+
+        .balign  64
+term_minus_3_loop:
+        movq    mm3, mm4
+        psrlq   mm3, 32                     # low dword of mm3 = high dword of mm4
+        punpckldq mm3, mm4                  # mm3 = sam_AB
+        movq    mm1, mm3
+        movq    mm4, mm3
+        pslld   mm1, 1
+        psrld   mm4, 15                     # mm4 = sam_AB >> 15 (upper part)
+        psrlw   mm1, 1                      # low words of mm1 = sam_AB & 0x7FFF
+        pmaddwd mm4, mm5                    # upper part * weight
+        pmaddwd mm1, mm5                    # lower part * weight
+        movq    mm2, [rdi]                  # mm2 = left_right
+        pslld   mm4, 5
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        paddd   mm4, mm2                    # add original left_right
+        paddd   mm4, mm1                    # add shifted sums
+        movq    [rdi], mm4                  # store result
+        movq    mm0, mm3
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        add     rdi, 8                      # advance bptr to next stereo pair
+        pxor    mm1, mm1                    # mm1 = zero
+        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pcmpeqd mm1, mm1                    # mm1 = all ones (-1 in each dword)
+        psubd   mm1, mm7                    # mm1 = -1 - 512
+        psubd   mm1, mm7                    # mm1 = -1 - 1024
+        psubd   mm1, mm0                    # mm1 = -1025 - sign: bias for saturating add
+        pxor    mm5, mm0                    # complement weights where sign is negative
+        paddw   mm5, mm1                    # bias so 0xFFFF corresponds to the limit
+        paddusw mm5, mm2                    # and add to weight_AB (unsigned saturating)
+        psubw   mm5, mm1                    # remove bias
+        pxor    mm5, mm0                    # complement back
+        cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
+        jb      term_minus_3_loop
+
+        pslld   mm5, 16                     # sign-extend 16-bit weights back to dwords
+        psrad   mm5, 16
+        mov     rdx, [rsp]                  # point to dpp
+        movq    [rdx+8], mm5                # put weight_AB back
+        emms
+
+        mov     edx, [rdi-4]                # dpp->samples_A [0] = bptr [-1];
+        mov     rax, [rsp]                  # rax = dpp*
+        mov     [rax+16], edx
+        mov     edx, [rdi-8]                # dpp->samples_B [0] = bptr [-2];
+        mov     [rax+48], edx
+
+done:   add     rsp, 8                      # release the dpp* stack slot
+        pop     rsi                         # restore registers saved on entry
+        pop     rdi
+        pop     rbx
+        pop     rbp
+        ret
+
+#######################################################################################################################
+#
+# This is the mono version of the above function. It does not use MMX and does not handle negative terms.
+#
+# void unpack_decorr_mono_pass_cont (struct decorr_pass *dpp,
+#                                    int32_t *buffer,
+#                                    int32_t sample_count,
+#                                    int32_t long_math);
+# arguments on entry:
+#
+# System V  Windows  
+#   rdi       rcx      struct decorr_pass *dpp
+#   rsi       rdx      int32_t *buffer
+#   edx       r8       int32_t sample_count
+#   ecx       r9       int32_t long_math
+#
+# registers after entry:
+#
+#   rdi         bptr
+#   rsi         eptr
+#
+# stack usage:
+#
+# [rsp+0] = *dpp
+#
+
+_unpack_decorr_mono_pass_cont_x64win:
+unpack_decorr_mono_pass_cont_x64win:
+        push    rbp                         # save registers (popped again at "mono_done")
+        push    rbx
+        push    rdi
+        push    rsi
+        sub     rsp, 8                      # reserve 8 bytes to hold dpp*
+
+        mov     rdi, rcx                    # copy params from win regs to Linux regs
+        mov     rsi, rdx                    # so we can leave following code similar
+        mov     rdx, r8                     # (rdx = sample_count)
+        mov     rcx, r9                     # (rcx = long_math)
+        jmp     mentry                      # jump into common portion
+
+_unpack_decorr_mono_pass_cont_x64:
+unpack_decorr_mono_pass_cont_x64:
+        push    rbp                         # same prologue as the Windows entry point
+        push    rbx
+        push    rdi
+        push    rsi
+        sub     rsp, 8                      # reserve 8 bytes to hold dpp*
+
+mentry: mov     [rsp], rdi                  # store dpp* into [rsp]
+        and     edx, edx                    # if sample_count is zero, do nothing
+        jz      mono_done
+
+        cld                                 # we use stosd
+        mov     rdi, rsi                    # rdi = bptr
+        lea     rsi, [rdi+rdx*4]            # rsi = eptr
+
+        mov     rax, [rsp]                  # get term from dpp struct & vector to handler
+        mov     eax, [rax]                  # eax = dpp->term
+        cmp     al, 17                      # only the low byte of term is compared
+        je      mono_17_entry
+        cmp     al, 18
+        je      mono_18_entry               # no match: fall through to default term code
+
+#
+# registers during default term processing loop:
+#   rdi         active buffer pointer
+#   rsi         end of buffer pointer
+#   r8d         delta
+#   ecx         weight_A
+#   ebx         term * -4
+#   eax,edx     scratch
+#
+
+default_mono_entry:
+        imul    rbx, rax, -4                # set rbx to term * -4 for decorrelation index
+        mov     rdx, [rsp]                  # rdx = dpp*
+        mov     ecx, [rdx+8]                # ecx = weight, r8d = delta
+        mov     r8d, [rdx+4]
+        jmp     default_mono_loop
+
+#
+# registers during processing loop for terms 17 & 18:
+#   rdi         active buffer pointer
+#   rsi         end of buffer pointer
+#   r8d         delta
+#   ecx         weight_A
+#   ebp         previously calculated value
+#   ebx         calculated correlation sample
+#   eax,edx     scratch
+#
+
+mono_17_entry:
+        mov     rdx, [rsp]                  # rdx = dpp*
+        mov     ecx, [rdx+8]                # ecx = weight, r8d = delta
+        mov     r8d, [rdx+4]
+        mov     ebp, [rdi-4]                # ebp = bptr [-1] (previously calculated value)
+        jmp     mono_17_loop
+
+mono_18_entry:
+        mov     rdx, [rsp]                  # rdx = dpp*
+        mov     ecx, [rdx+8]                # ecx = weight, r8d = delta
+        mov     r8d, [rdx+4]
+        mov     ebp, [rdi-4]                # ebp = bptr [-1] (previously calculated value)
+        jmp     mono_18_loop
+
+        .balign  64
+default_mono_loop:
+        mov     eax, [rdi+rbx]              # eax = bptr [-term] (correlation sample)
+        imul    eax, ecx                    # eax = sample * weight_A (32-bit product)
+        mov     edx, [rdi]                  # edx = original bptr [0]
+        jo      long_default_mono_loop      # product overflowed: use long math loop
+        sar     eax, 10                     # CF = last bit shifted out (for rounding)
+        adc     eax, edx                    # eax = (product >> 10) + CF + original
+        mov     [rdi], eax                  # store result
+        mov     eax, [rdi+rbx]              # reload correlation sample
+        add     rdi, 4                      # advance bptr
+        test    edx, edx
+        je      L100                        # no weight update if original is zero
+        test    eax, eax
+        je      L100                        # ...or if correlation sample is zero
+        xor     eax, edx
+        cdq                                 # edx = 0 or -1: sign (sample ^ original)
+        xor     ecx, edx                    # complement weight_A if sign is negative
+        add     ecx, r8d                    # add delta
+        xor     ecx, edx                    # complement back (net: weight -= delta there)
+L100:   cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
+        jb      default_mono_loop
+        jmp     default_mono_done
+
+        .balign  64
+long_default_mono_loop:
+        mov     eax, [rdi+rbx]              # eax = bptr [-term] (correlation sample)
+        imul    ecx                         # edx:eax = sample * weight_A (64-bit product)
+        shl     edx, 22
+        shr     eax, 10                     # CF = last bit shifted out (for rounding)
+        adc     eax, edx                    # eax = product >> 10 (low 32 bits), rounded
+        mov     edx, [rdi]                  # edx = original bptr [0]
+        add     eax, edx                    # add original value
+        mov     [rdi], eax                  # store result
+        mov     eax, [rdi+rbx]              # reload correlation sample
+        add     rdi, 4                      # advance bptr
+        test    edx, edx
+        je      L101
+        test    eax, eax
+        je      L101
+        xor     eax, edx
+        cdq                                 # edx = 0 or -1: sign (sample ^ original)
+        xor     ecx, edx                    # same weight update as the 32-bit loop
+        add     ecx, r8d
+        xor     ecx, edx
+L101:   cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
+        jb      long_default_mono_loop
+
+default_mono_done:
+        mov     rdx, [rsp]                  # rdx = dpp*
+        mov     [rdx+8], ecx                # store weight_A back
+        mov     ecx, [rdx]                  # ecx = dpp->term
+
+default_mono_store_samples:
+        dec     ecx                         # ecx = history index, term-1 down to 0
+        sub     rdi, 4                      # back up one full sample
+        mov     eax, [rdi]
+        mov     [rdx+rcx*4+16], eax         # store samples_A [ecx]
+        test    ecx, ecx
+        jnz     default_mono_store_samples  # loop until index 0 has been stored
+        jmp     mono_done
+
+        .balign  64
+mono_17_loop:
+        lea     ebx, [ebp+ebp]              # ebx = 2 * previous value
+        sub     ebx, [rdi-8]                # ebx = 2 * bptr [-1] - bptr [-2] (sample)
+        mov     eax, ecx
+        imul    eax, ebx                    # eax = weight_A * sample (32-bit product)
+        mov     edx, [rdi]                  # edx = original bptr [0]
+        jo      long_mono_17_loop           # product overflowed: use long math loop
+        sar     eax, 10                     # CF = last bit shifted out (for rounding)
+        adc     eax, edx                    # eax = (product >> 10) + CF + original
+        stosd                               # store result and advance bptr
+        test    ebx, ebx
+        mov     ebp, eax                    # remember result as previous value
+        je      L117                        # no weight update if sample is zero
+        test    edx, edx
+        je      L117                        # ...or if original value is zero
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = 0 or -1: sign (sample ^ original)
+        xor     ecx, ebx                    # complement weight_A if sign is negative
+        add     ecx, r8d                    # add delta
+        xor     ecx, ebx                    # complement back (net: weight -= delta there)
+L117:   cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
+        jb      mono_17_loop
+        jmp     mono_1718_exit
+
+        .balign  64
+long_mono_17_loop:
+        lea     ebx, [ebp+ebp]              # ebx = 2 * previous value
+        sub     ebx, [rdi-8]                # ebx = 2 * bptr [-1] - bptr [-2] (sample)
+        mov     eax, ecx
+        imul    ebx                         # edx:eax = weight_A * sample (64-bit product)
+        shl     edx, 22
+        shr     eax, 10                     # CF = last bit shifted out (for rounding)
+        adc     eax, edx                    # eax = product >> 10 (low 32 bits), rounded
+        mov     edx, [rdi]                  # edx = original bptr [0]
+        add     eax, edx                    # add original value
+        stosd                               # store result and advance bptr
+        test    ebx, ebx
+        mov     ebp, eax                    # remember result as previous value
+        je      L217
+        test    edx, edx
+        je      L217
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = 0 or -1: sign (sample ^ original)
+        xor     ecx, ebx                    # same weight update as the 32-bit loop
+        add     ecx, r8d
+        xor     ecx, ebx
+L217:   cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
+        jb      long_mono_17_loop
+        jmp     mono_1718_exit
+
+        .balign  64
+mono_18_loop:
+        lea     ebx, [ebp+ebp*2]            # ebx = 3 * previous value
+        sub     ebx, [rdi-8]                # ebx = 3 * bptr [-1] - bptr [-2]
+        sar     ebx, 1                      # ebx = that difference >> 1 (sample)
+        mov     eax, ecx
+        imul    eax, ebx                    # eax = weight_A * sample (32-bit product)
+        mov     edx, [rdi]                  # edx = original bptr [0]
+        jo      long_mono_18_loop           # product overflowed: use long math loop
+        sar     eax, 10                     # CF = last bit shifted out (for rounding)
+        adc     eax, edx                    # eax = (product >> 10) + CF + original
+        stosd                               # store result and advance bptr
+        test    ebx, ebx
+        mov     ebp, eax                    # remember result as previous value
+        je      L118                        # no weight update if sample is zero
+        test    edx, edx
+        je      L118                        # ...or if original value is zero
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = 0 or -1: sign (sample ^ original)
+        xor     ecx, ebx                    # complement weight_A if sign is negative
+        add     ecx, r8d                    # add delta
+        xor     ecx, ebx                    # complement back (net: weight -= delta there)
+L118:   cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
+        jb      mono_18_loop
+        jmp     mono_1718_exit
+
+        .balign  64
+long_mono_18_loop:
+        lea     ebx, [ebp+ebp*2]            # ebx = 3 * previous value
+        sub     ebx, [rdi-8]                # ebx = 3 * bptr [-1] - bptr [-2]
+        sar     ebx, 1                      # ebx = that difference >> 1 (sample)
+        mov     eax, ecx
+        imul    ebx                         # edx:eax = weight_A * sample (64-bit product)
+        shl     edx, 22
+        shr     eax, 10                     # CF = last bit shifted out (for rounding)
+        adc     eax, edx                    # eax = product >> 10 (low 32 bits), rounded
+        mov     edx, [rdi]                  # edx = original bptr [0]
+        add     eax, edx                    # add original value
+        stosd                               # store result and advance bptr
+        test    ebx, ebx
+        mov     ebp, eax                    # remember result as previous value
+        je      L218
+        test    edx, edx
+        je      L218
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = 0 or -1: sign (sample ^ original)
+        xor     ecx, ebx                    # same weight update as the 32-bit loop
+        add     ecx, r8d
+        xor     ecx, ebx
+L218:   cmp     rdi, rsi                    # compare bptr and eptr to see if we're done
+        jb      long_mono_18_loop
+
+mono_1718_exit:
+        mov     rdx, [rsp]                  # rdx = dpp*
+        mov     [rdx+8], ecx                # store weight_A back
+        mov     eax, [rdi-4]                # dpp->samples_A [0] = bptr [-1];
+        mov     [rdx+16], eax
+        mov     eax, [rdi-8]                # dpp->samples_A [1] = bptr [-2];
+        mov     [rdx+20], eax
+
+mono_done:
+        add     rsp, 8                      # release the dpp* stack slot
+        pop     rsi                         # restore registers saved on entry
+        pop     rdi
+        pop     rbx
+        pop     rbp
+        ret
+
+#ifdef __ELF__
+        .section .note.GNU-stack,"",@progbits
+#endif
+
diff --git a/third_party/wavpack/src/unpack_x64.asm b/third_party/wavpack/src/unpack_x64.asm
new file mode 100644
index 0000000..a4df18a
--- /dev/null
+++ b/third_party/wavpack/src/unpack_x64.asm
@@ -0,0 +1,930 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;                           **** WAVPACK ****                            ;;
+;;                  Hybrid Lossless Wavefile Compressor                   ;;
+;;              Copyright (c) 1998 - 2015 Conifer Software.               ;;
+;;                          All Rights Reserved.                          ;;
+;;      Distributed under the BSD Software License (see license.txt)      ;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+        include <ksamd64.inc>
+
+asmcode segment page 'CODE'
+
+; This is an assembly optimized version of the following WavPack function:
+;
+; void unpack_decorr_stereo_pass_cont (struct decorr_pass *dpp,
+;                                      int32_t *buffer,
+;                                      int32_t sample_count,
+;                                      int32_t long_math);
+;
+; It performs a single pass of stereo decorrelation on the provided buffer.
+; Note that this version of the function requires that up to 8 previous
+; stereo samples are visible and correct. In other words, it ignores the
+; "samples_*" fields in the decorr_pass structure and gets the history data
+; directly from the buffer. It does, however, return the appropriate history
+; samples to the decorr_pass structure before returning.
+;
+; The "long_math" argument is used to specify that a 32-bit multiply is
+; not enough for the "apply_weight" operation (although in this case it
+; would only apply to the -1 and -2 terms because the MMX code does not have
+; this limitation) but we ignore the parameter and use the overflow detection
+; of the "imul" instruction to switch automatically to the "long_math" loop.
+;
+; This is written to work on an X86-64 processor (also called the AMD64)
+; running in 64-bit mode and generally uses the MMX extensions to improve
+; the performance by processing both stereo channels together. Unfortunately
+; this is not easily used for terms -1 and -2, so these terms are handled
+; sequentially with regular assembler code.
+;
+; This version is for 64-bit Windows. The arguments are passed in registers:
+;
+;   rcx     struct decorr_pass *dpp
+;   rdx     int32_t *buffer
+;   r8d     int32_t sample_count
+;   r9d     int32_t long_math
+;
+; registers after entry:
+;
+;   rdi         bptr
+;   rsi         eptr
+;   ecx         long_math (never read; imul overflow selects the long-math loops)
+;
+; stack usage:
+;
+; [rsp+0] = *dpp
+;
+
+unpack_decorr_stereo_pass_cont_x64win proc public frame    ; Win64 entry point: one stereo decorrelation pass over buffer
+        push_reg    rbp                     ; save non-volatile registers on stack
+        push_reg    rbx                     ; (alphabetically)
+        push_reg    rdi
+        push_reg    rsi
+        alloc_stack 8                       ; allocate 8 bytes on stack & align to 16 bytes
+        end_prologue
+
+        mov     [rsp], rcx                  ; [rsp] = *dpp
+        mov     rdi, rcx                    ; copy params from win regs to Linux regs
+        mov     rsi, rdx                    ; so we can leave following code similar
+        mov     rdx, r8                     ; rdx = sample_count
+        mov     rcx, r9                     ; rcx = long_math (no path below reads it)
+
+        and     edx, edx                    ; if sample_count is zero, do nothing (32-bit write also zero-extends into rdx)
+        jz      done
+
+        mov     rdi, rsi                    ; rdi = bptr
+        lea     rsi, [rdi+rdx*8]            ; rsi = eptr (bptr + sample_count * 8 bytes)
+
+        mov     rax, [rsp]                  ; get term from dpp struct & vector to handler
+        mov     eax, [rax]                  ; eax = dpp->term (upper half of rax cleared)
+        cmp     al, 17
+        je      term_17_entry
+        cmp     al, 18
+        je      term_18_entry
+        cmp     al, -1
+        je      term_minus_1_entry
+        cmp     al, -2
+        je      term_minus_2_entry
+        cmp     al, -3
+        je      term_minus_3_entry
+
+;
+; registers in default term loop:
+;
+;   rbx         term * -8 (for indexing correlation sample)
+;   rdi         bptr
+;   rsi         eptr
+;
+;   mm0, mm1    scratch
+;   mm2         original sample values
+;   mm3         correlation sample
+;   mm4         zero (for pcmpeqd)
+;   mm5         weights
+;   mm6         delta
+;   mm7         512 (for rounding)
+;
+
+default_term_entry:                         ; every other term value falls through to here
+        imul    rbx, rax, -8                ; set RBX to term * -8
+        mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  ; mm7 = round (512)
+        mov     rdx, [rsp]                  ; set RDX to *dpp
+        mov     eax, [rdx+4]                ; eax = dpp->delta
+        movd    mm6, eax
+        punpckldq mm6, mm6                  ; mm6 = delta (0-7)
+        mov     eax, 0FFFFh                 ; mask high weights to zero for PMADDWD
+        movd    mm5, eax
+        punpckldq mm5, mm5                  ; mm5 = weight mask 0x0000FFFF0000FFFF
+        pand    mm5, [rdx+8]                ; mm5 = weight_AB masked to 16 bits
+        pxor    mm4, mm4                    ; mm4 = zero (for pcmpeqd)
+        jmp     default_term_loop           ; jump over the alignment padding
+
+        align  64
+default_term_loop:
+        movq    mm3, [rdi+rbx]              ; mm3 = sam_AB
+        movq    mm1, mm3
+        movq    mm0, mm3
+        paddd   mm1, mm1                    ; shift each dword left 1 so bit 15 leaves the low word
+        psrld   mm0, 15                     ; low word of each dword = sample bits 15-30
+        psrlw   mm1, 1                      ; low word of each dword = sample bits 0-14
+        pmaddwd mm0, mm5                    ; upper part * weight (high words of mm5 are zero, so only low words contribute)
+        pmaddwd mm1, mm5                    ; lower part * weight
+        movq    mm2, [rdi]                  ; mm2 = left_right
+        pslld   mm0, 5                      ; upper product: << 15 for its bit position, >> 10 for the weight scale
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10                     ; lower product scaled down by 1024
+        paddd   mm0, mm2
+        paddd   mm0, mm1                    ; add shifted sums
+        movq    [rdi], mm0                  ; store result
+        movq    mm0, mm3
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ left_right)
+        add     rdi, 8                      ; advance bptr by one stereo sample
+        pcmpeqd mm2, mm4                    ; mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm4                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pxor    mm5, mm0                    ; complement weights where the sign is negative
+        paddw   mm5, mm2                    ; and add to weight_AB
+        pxor    mm5, mm0                    ; complement back (net effect there: weight -= delta)
+        cmp     rdi, rsi                    ; compare bptr and eptr to see if we're done
+        jb      default_term_loop
+
+        pslld   mm5, 16                     ; sign-extend 16-bit weights back to dwords
+        psrad   mm5, 16
+        mov     rdx, [rsp]                  ; point to dpp
+        movq    [rdx+8], mm5                ; put weight_AB back
+        emms                                ; exit MMX state (empties the x87 tag word)
+
+        mov     ecx, [rdx]                  ; ecx = dpp->term
+
+default_store_samples:
+        dec     ecx                         ; ecx = history slot to fill (term-1 down to 0)
+        sub     rdi, 8                      ; back up one full sample
+        mov     eax, [rdi+4]
+        mov     [rdx+rcx*4+48], eax         ; store samples_B [ecx]
+        mov     eax, [rdi]
+        mov     [rdx+rcx*4+16], eax         ; store samples_A [ecx]
+        test    ecx, ecx
+        jnz     default_store_samples       ; repeat until slot 0 has been written
+        jmp     done
+
+;
+; registers in term 17 & 18 loops:
+;
+;   rdi         bptr
+;   rsi         eptr
+;
+;   mm0, mm1    scratch
+;   mm2         original sample values
+;   mm3         correlation samples
+;   mm4         last calculated values (so we don't need to reload)
+;   mm5         weights
+;   mm6         delta
+;   mm7         512 (for rounding)
+;
+
+term_17_entry:
+        mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  ; mm7 = round (512)
+        mov     rdx, [rsp]                  ; set RDX to *dpp
+        mov     eax, [rdx+4]                ; eax = dpp->delta
+        movd    mm6, eax
+        punpckldq mm6, mm6                  ; mm6 = delta (0-7)
+        mov     eax, 0FFFFh                 ; mask high weights to zero for PMADDWD
+        movd    mm5, eax
+        punpckldq mm5, mm5                  ; mm5 = weight mask 0x0000FFFF0000FFFF
+        pand    mm5, [rdx+8]                ; mm5 = weight_AB masked to 16 bits
+        movq    mm4, [rdi-8]                ; preload last calculated values in mm4
+        jmp     term_17_loop                ; jump over the alignment padding
+
+        align  64
+term_17_loop:
+        paddd   mm4, mm4                    ; mm4 = 2 * previous stereo sample
+        psubd   mm4, [rdi-16]               ; mm4 = 2 * previous - the one before it = sam_AB (copied to mm3 next)
+        movq    mm3, mm4
+        movq    mm1, mm3
+        paddd   mm1, mm1                    ; same split multiply as the default term loop:
+        psrld   mm4, 15                     ;   mm4 low words = sample bits 15-30
+        psrlw   mm1, 1                      ;   mm1 low words = sample bits 0-14
+        pmaddwd mm4, mm5
+        pmaddwd mm1, mm5
+        movq    mm2, [rdi]                  ; mm2 = left_right
+        pslld   mm4, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        paddd   mm4, mm2
+        paddd   mm4, mm1                    ; add shifted sums
+        movq    mm0, mm3
+        movq    [rdi], mm4                  ; store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ left_right)
+        add     rdi, 8                      ; advance bptr by one stereo sample
+        pxor    mm1, mm1                    ; mm1 = zero
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pxor    mm5, mm0                    ; complement weights where the sign is negative
+        paddw   mm5, mm2                    ; and add to weight_AB
+        pxor    mm5, mm0                    ; complement back (net effect there: weight -= delta)
+        cmp     rdi, rsi                    ; compare bptr and eptr to see if we're done
+        jb      term_17_loop
+        jmp     term_1718_exit              ; terms 17 & 18 treat samples_AB[] the same
+
+term_18_entry:
+        mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  ; mm7 = round (512)
+        mov     rdx, [rsp]                  ; set RDX to *dpp
+        mov     eax, [rdx+4]                ; eax = dpp->delta
+        movd    mm6, eax
+        punpckldq mm6, mm6                  ; mm6 = delta (0-7)
+        mov     eax, 0FFFFh                 ; mask high weights to zero for PMADDWD
+        movd    mm5, eax
+        punpckldq mm5, mm5                  ; mm5 = weight mask 0x0000FFFF0000FFFF
+        pand    mm5, [rdx+8]                ; mm5 = weight_AB masked to 16 bits
+        movq    mm4, [rdi-8]                ; preload last calculated values in mm4
+        jmp     term_18_loop                ; jump over the alignment padding
+
+        align  64
+term_18_loop:
+        movq    mm3, mm4                    ; mm3 = previous stereo sample
+        psubd   mm3, [rdi-16]               ; minus the sample before it
+        psrad   mm3, 1                      ; halve the difference (arithmetic shift)
+        paddd   mm3, mm4                    ; mm3 = sam_AB
+        movq    mm1, mm3
+        movq    mm4, mm3
+        paddd   mm1, mm1                    ; same split multiply as the default term loop:
+        psrld   mm4, 15                     ;   mm4 low words = sample bits 15-30
+        psrlw   mm1, 1                      ;   mm1 low words = sample bits 0-14
+        pmaddwd mm4, mm5
+        pmaddwd mm1, mm5
+        movq    mm2, [rdi]                  ; mm2 = left_right
+        pslld   mm4, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        paddd   mm4, mm2
+        paddd   mm4, mm1                    ; add shifted sums
+        movq    mm0, mm3
+        movq    [rdi], mm4                  ; store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ left_right)
+        add     rdi, 8                      ; advance bptr by one stereo sample
+        pxor    mm1, mm1                    ; mm1 = zero
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pxor    mm5, mm0                    ; complement weights where the sign is negative
+        paddw   mm5, mm2                    ; and add to weight_AB
+        pxor    mm5, mm0                    ; complement back (net effect there: weight -= delta)
+        cmp     rdi, rsi                    ; compare bptr and eptr to see if we're done
+        jb      term_18_loop
+
+term_1718_exit:
+        pslld   mm5, 16                     ; sign-extend 16-bit weights back to dwords
+        psrad   mm5, 16
+        mov     rdx, [rsp]                  ; point to dpp
+        movq    [rdx+8], mm5                ; put weight_AB back
+        emms                                ; exit MMX state (empties the x87 tag word)
+
+        mov     eax, [rdi-4]                ; dpp->samples_B [0] = bptr [-1];
+        mov     [rdx+48], eax
+        mov     eax, [rdi-8]                ; dpp->samples_A [0] = bptr [-2];
+        mov     [rdx+16], eax
+        mov     eax, [rdi-12]               ; dpp->samples_B [1] = bptr [-3];
+        mov     [rdx+52], eax
+        mov     eax, [rdi-16]               ; dpp->samples_A [1] = bptr [-4];
+        mov     [rdx+20], eax
+        jmp     done
+
+;
+; registers in term -1 & -2 loops:
+;
+;   eax,ebx,edx scratch
+;   ecx         weight_A
+;   ebp         weight_B
+;   rdi         bptr
+;   rsi         eptr
+;   r8d         delta
+;
+
+term_minus_1_entry:
+        cld                                 ; stosd below must advance rdi upward
+        mov     rdx, [rsp]                  ; point to dpp
+        mov     ecx, [rdx+8]                ; ecx = weight_A
+        mov     ebp, [rdx+12]               ; ebp = weight_B
+        mov     r8d, [rdx+4]                ; r8d = delta
+        mov     eax, [rdi-4]                ; eax = bptr [-1], correlation sample for the first value
+        jmp     term_minus_1_loop           ; jump over the alignment padding
+
+        align  64
+term_minus_1_loop:
+        mov     ebx, eax                    ; keep correlation sample for weight update / overflow restart
+        imul    eax, ecx                    ; 32-bit product with weight_A; OF set if it does not fit
+        mov     edx, [rdi]                  ; edx = value to decorrelate (mov leaves flags intact)
+        jo      OV11                        ; overflow: redo this value in the 64-bit loop
+        sar     eax, 10                     ; scale by 1/1024; CF = last bit shifted out
+        adc     eax, edx                    ; add input plus CF (rounds the scaled product)
+        stosd                               ; store result, rdi += 4
+        test    ebx, ebx
+        je      L182                        ; no weight update if correlation sample was zero
+        test    edx, edx
+        je      L182                        ; ... or if the input value was zero
+        xor     ebx, edx
+        sar     ebx, 31                     ; ebx = 0 or -1: sign (sam ^ input)
+        xor     ecx, ebx                    ; complement weight_A when sign is negative
+        add     ecx, r8d                    ; add delta
+        mov     edx, 1024
+        add     edx, ebx                    ; edx = upper limit: 1024, or 1023 in the complemented domain
+        cmp     ecx, edx
+        jle     L183
+        mov     ecx, edx                    ; clamp to the limit
+L183:   xor     ecx, ebx                    ; complement back
+L182:   mov     ebx, eax                    ; value just produced is the correlation sample for the next one
+        imul    eax, ebp                    ; 32-bit product with weight_B
+        mov     edx, [rdi]
+        jo      OV12                        ; overflow: continue in the 64-bit loop
+        sar     eax, 10
+        adc     eax, edx
+        stosd
+        test    ebx, ebx
+        je      L187
+        test    edx, edx
+        je      L187
+        xor     ebx, edx
+        sar     ebx, 31
+        xor     ebp, ebx                    ; same clamped update as above, applied to weight_B
+        add     ebp, r8d
+        mov     edx, 1024
+        add     edx, ebx
+        cmp     ebp, edx
+        jle     L188
+        mov     ebp, edx
+L188:   xor     ebp, ebx
+L187:   cmp     rdi, rsi                    ; compare bptr and eptr to see if we're done
+        jb      term_minus_1_loop
+        jmp     term_minus_1_done
+
+OV11:   mov     eax, ebx                    ; restore previous sample into eax
+        jmp     long_term_minus_1_loop      ; nothing was stored yet, so restart this value there
+
+OV12:   mov     eax, ebx                    ; restore previous sample into eax
+        jmp     L282                        ; resume at the weight_B step of the long loop
+
+        align  64
+long_term_minus_1_loop:
+        mov     ebx, eax                    ; keep correlation sample for the weight update
+        imul    ecx                         ; edx:eax = eax * weight_A (64-bit signed product)
+        shl     edx, 22                     ; low 10 bits of the high dword move to bits 22-31
+        shr     eax, 10                     ; low dword >> 10; CF = last bit shifted out
+        adc     eax, edx                    ; eax = low 32 bits of (product >> 10), plus rounding carry
+        mov     edx, [rdi]                  ; edx = value to decorrelate
+        add     eax, edx
+        stosd                               ; store result, rdi += 4
+        test    ebx, ebx
+        je      L282
+        test    edx, edx
+        je      L282
+        xor     ebx, edx
+        sar     ebx, 31                     ; ebx = 0 or -1: sign (sam ^ input)
+        xor     ecx, ebx                    ; same clamped weight_A update as in the short loop
+        add     ecx, r8d
+        mov     edx, 1024
+        add     edx, ebx
+        cmp     ecx, edx
+        jle     L283
+        mov     ecx, edx
+L283:   xor     ecx, ebx
+L282:   mov     ebx, eax                    ; value just produced is the correlation sample for the next one
+        imul    ebp                         ; edx:eax = eax * weight_B
+        shl     edx, 22
+        shr     eax, 10
+        adc     eax, edx
+        mov     edx, [rdi]
+        add     eax, edx
+        stosd
+        test    ebx, ebx
+        je      L287
+        test    edx, edx
+        je      L287
+        xor     ebx, edx
+        sar     ebx, 31
+        xor     ebp, ebx                    ; same clamped update, applied to weight_B
+        add     ebp, r8d
+        mov     edx, 1024
+        add     edx, ebx
+        cmp     ebp, edx
+        jle     L288
+        mov     ebp, edx
+L288:   xor     ebp, ebx
+L287:   cmp     rdi, rsi                    ; compare bptr and eptr to see if we're done
+        jb      long_term_minus_1_loop      ; once entered, the long loop is used to the end of the buffer
+
+term_minus_1_done:
+        mov     rdx, [rsp]                  ; point to dpp
+        mov     [rdx+8], ecx                ; store weights back
+        mov     [rdx+12], ebp
+        mov     eax, [rdi-4]                ; dpp->samples_A [0] = bptr [-1];
+        mov     [rdx+16], eax
+        jmp     done
+
+term_minus_2_entry:
+        mov     rdx, [rsp]                  ; point to dpp
+        mov     ecx, [rdx+8]                ; ecx = weight_A
+        mov     ebp, [rdx+12]               ; ebp = weight_B
+        mov     r8d, [rdx+4]                ; r8d = delta
+        mov     eax, [rdi-8]                ; eax = bptr [-2], correlation sample for the first value
+        jmp     term_minus_2_loop           ; jump over the alignment padding
+
+        align  64
+term_minus_2_loop:
+        mov     ebx, eax                    ; keep correlation sample for weight update / overflow restart
+        imul    eax, ebp                    ; 32-bit product with weight_B; OF set if it does not fit
+        mov     edx, [rdi+4]                ; edx = second value of the pair (processed first here)
+        jo      OV21                        ; overflow: redo this value in the 64-bit loop
+        sar     eax, 10                     ; scale by 1/1024; CF = last bit shifted out
+        adc     eax, edx                    ; add input plus CF (rounds the scaled product)
+        mov     [rdi+4], eax                ; store result in place
+        test    ebx, ebx
+        je      L194
+        test    edx, edx
+        je      L194
+        xor     ebx, edx
+        sar     ebx, 31                     ; ebx = 0 or -1: sign (sam ^ input)
+        xor     ebp, ebx                    ; clamped weight_B update (same scheme as the term -1 loop)
+        add     ebp, r8d
+        mov     edx, 1024
+        add     edx, ebx
+        cmp     ebp, edx
+        jle     L195
+        mov     ebp, edx
+L195:   xor     ebp, ebx
+L194:   mov     ebx, eax                    ; value just produced is the correlation sample for the first value of the pair
+        imul    eax, ecx                    ; 32-bit product with weight_A
+        mov     edx, [rdi]
+        jo      OV22                        ; overflow: continue in the 64-bit loop
+        sar     eax, 10
+        adc     eax, edx
+        mov     [rdi], eax                  ; store result in place
+        test    ebx, ebx
+        je      L199
+        test    edx, edx
+        je      L199
+        xor     ebx, edx
+        sar     ebx, 31
+        xor     ecx, ebx                    ; clamped weight_A update
+        add     ecx, r8d
+        mov     edx, 1024
+        add     edx, ebx
+        cmp     ecx, edx
+        jle     L200
+        mov     ecx, edx
+L200:   xor     ecx, ebx
+L199:   add     rdi, 8                      ; advance bptr by one stereo sample (eax carries over as next correlation sample)
+        cmp     rdi, rsi                    ; compare bptr and eptr to see if we're done
+        jb      term_minus_2_loop
+        jmp     term_minus_2_done
+
+OV21:   mov     eax, ebx                    ; restore previous sample into eax
+        jmp     long_term_minus_2_loop      ; nothing was stored yet, so restart this value there
+
+OV22:   mov     eax, ebx                    ; restore previous sample into eax
+        jmp     L294                        ; resume at the weight_A step of the long loop
+
+        align  64
+long_term_minus_2_loop:
+        mov     ebx, eax                    ; keep correlation sample for the weight update
+        imul    ebp                         ; edx:eax = eax * weight_B (64-bit signed product)
+        shl     edx, 22                     ; low 10 bits of the high dword move to bits 22-31
+        shr     eax, 10                     ; low dword >> 10; CF = last bit shifted out
+        adc     eax, edx                    ; eax = low 32 bits of (product >> 10), plus rounding carry
+        mov     edx, [rdi+4]
+        add     eax, edx
+        mov     [rdi+4], eax                ; store result in place
+        test    ebx, ebx
+        je      L294
+        test    edx, edx
+        je      L294
+        xor     ebx, edx
+        sar     ebx, 31
+        xor     ebp, ebx                    ; clamped weight_B update
+        add     ebp, r8d
+        mov     edx, 1024
+        add     edx, ebx
+        cmp     ebp, edx
+        jle     L295
+        mov     ebp, edx
+L295:   xor     ebp, ebx
+L294:   mov     ebx, eax                    ; value just produced is the correlation sample for the first value of the pair
+        imul    ecx                         ; edx:eax = eax * weight_A
+        shl     edx, 22
+        shr     eax, 10
+        adc     eax, edx
+        mov     edx, [rdi]
+        add     eax, edx
+        mov     [rdi], eax                  ; store result in place
+        test    ebx, ebx
+        je      L299
+        test    edx, edx
+        je      L299
+        xor     ebx, edx
+        sar     ebx, 31
+        xor     ecx, ebx                    ; clamped weight_A update
+        add     ecx, r8d
+        mov     edx, 1024
+        add     edx, ebx
+        cmp     ecx, edx
+        jle     L300
+        mov     ecx, edx
+L300:   xor     ecx, ebx
+L299:   add     rdi, 8                      ; advance bptr by one stereo sample
+        cmp     rdi, rsi                    ; compare bptr and eptr to see if we're done
+        jb      long_term_minus_2_loop      ; once entered, the long loop is used to the end of the buffer
+
+term_minus_2_done:
+        mov     rdx, [rsp]                  ; point to dpp
+        mov     [rdx+8], ecx                ; store weights back
+        mov     [rdx+12], ebp
+        mov     eax, [rdi-8]                ; dpp->samples_B [0] = bptr [-2];
+        mov     [rdx+48], eax
+        jmp     done
+
+;
+; registers in term -3 loop:
+;
+;   rdi         bptr
+;   rsi         eptr
+;
+;   mm0, mm1    scratch
+;   mm2         original sample values
+;   mm3         correlation samples
+;   mm4         last calculated values (so we don't need to reload)
+;   mm5         weights
+;   mm6         delta
+;   mm7         512 (for rounding)
+;
+
+term_minus_3_entry:
+        mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  ; mm7 = round (512)
+        mov     rdx, [rsp]                  ; set RDX to *dpp
+        mov     eax, [rdx+4]                ; eax = dpp->delta
+        movd    mm6, eax
+        punpckldq mm6, mm6                  ; mm6 = delta (0-7)
+        mov     eax, 0FFFFh                 ; mask high weights to zero for PMADDWD
+        movd    mm5, eax
+        punpckldq mm5, mm5                  ; mm5 = weight mask 0x0000FFFF0000FFFF
+        pand    mm5, [rdx+8]                ; mm5 = weight_AB masked to 16 bits
+        movq    mm4, [rdi-8]                ; preload previous stereo sample in mm4
+        jmp     term_minus_3_loop           ; jump over the alignment padding
+
+        align  64
+term_minus_3_loop:
+        movq    mm3, mm4
+        psrlq   mm3, 32                     ; low dword of mm3 = high dword of previous sample
+        punpckldq mm3, mm4                  ; mm3 = sam_AB
+        movq    mm1, mm3
+        movq    mm4, mm3
+        pslld   mm1, 1                      ; same split multiply as the default term loop:
+        psrld   mm4, 15                     ;   mm4 low words = sample bits 15-30
+        psrlw   mm1, 1                      ;   mm1 low words = sample bits 0-14
+        pmaddwd mm4, mm5
+        pmaddwd mm1, mm5
+        movq    mm2, [rdi]                  ; mm2 = left_right
+        pslld   mm4, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        paddd   mm4, mm2
+        paddd   mm4, mm1                    ; add shifted sums
+        movq    [rdi], mm4                  ; store result
+        movq    mm0, mm3
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ left_right)
+        add     rdi, 8                      ; advance bptr by one stereo sample
+        pxor    mm1, mm1                    ; mm1 = zero
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pcmpeqd mm1, mm1                    ; mm1 = all ones (-1 in each dword)
+        psubd   mm1, mm7                    ; mm1 = -1 - 512
+        psubd   mm1, mm7                    ; mm1 = -1 - 1024
+        psubd   mm1, mm0                    ; mm1 = -1025, or -1024 where sign is negative (bias for the saturating add)
+        pxor    mm5, mm0                    ; complement weights where the sign is negative
+        paddw   mm5, mm1                    ; apply bias so unsigned saturation acts as the upper clamp
+        paddusw mm5, mm2                    ; and add to weight_AB
+        psubw   mm5, mm1                    ; remove bias
+        pxor    mm5, mm0                    ; complement back
+        cmp     rdi, rsi                    ; compare bptr and eptr to see if we're done
+        jb      term_minus_3_loop
+
+        pslld   mm5, 16                     ; sign-extend 16-bit weights back to dwords
+        psrad   mm5, 16
+        mov     rdx, [rsp]                  ; point to dpp
+        movq    [rdx+8], mm5                ; put weight_AB back
+        emms                                ; exit MMX state (empties the x87 tag word)
+
+        mov     edx, [rdi-4]                ; dpp->samples_A [0] = bptr [-1];
+        mov     rax, [rsp]                  ; rax = dpp
+        mov     [rax+16], edx
+        mov     edx, [rdi-8]                ; dpp->samples_B [0] = bptr [-2];
+        mov     [rax+48], edx
+
+done:   add     rsp, 8                      ; begin epilog by deallocating stack
+        pop     rsi                         ; restore non-volatile registers & return
+        pop     rdi
+        pop     rbx
+        pop     rbp
+        ret
+
+unpack_decorr_stereo_pass_cont_x64win endp
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; This is the mono version of the above function. It does not use MMX and does not
+; handle negative terms (since they don't apply to mono), but is otherwise similar.
+;
+; void unpack_decorr_mono_pass_cont (struct decorr_pass *dpp,
+;                                    int32_t *buffer,
+;                                    int32_t sample_count,
+;                                    int32_t long_math);
+; arguments on entry:
+;
+;   rcx     struct decorr_pass *dpp
+;   rdx     int32_t *buffer
+;   r8d     int32_t sample_count
+;   r9d     int32_t long_math
+;
+; registers after entry:
+;
+;   rdi         bptr
+;   rsi         eptr
+;   ecx         long_math (never read; every handler reloads ecx with weight_A)
+;
+; stack usage:
+;
+; [rsp+0] = *dpp
+;
+
+unpack_decorr_mono_pass_cont_x64win proc public frame    ; Win64 entry point: one mono decorrelation pass over buffer
+        push_reg    rbp                     ; save non-volatile registers on stack
+        push_reg    rbx                     ; (alphabetically)
+        push_reg    rdi
+        push_reg    rsi
+        alloc_stack 8                       ; allocate 8 bytes on stack & align to 16 bytes
+        end_prologue
+
+        mov     [rsp], rcx                  ; [rsp] = *dpp
+        mov     rdi, rcx                    ; copy params from win regs to Linux regs
+        mov     rsi, rdx                    ; so we can leave following code similar
+        mov     rdx, r8                     ; rdx = sample_count
+        mov     rcx, r9                     ; rcx = long_math (replaced by weight_A before any read)
+
+        and     edx, edx                    ; if sample_count is zero, do nothing (32-bit write also zero-extends into rdx)
+        jz      mono_done
+
+        cld                                 ; stosd in the 17/18 loops must advance rdi upward
+        mov     rdi, rsi                    ; rdi = bptr
+        lea     rsi, [rdi+rdx*4]            ; rsi = eptr (bptr + sample_count * 4 bytes)
+
+        mov     rax, [rsp]                  ; get term from dpp struct & vector to handler
+        mov     eax, [rax]                  ; eax = dpp->term (upper half of rax cleared)
+        cmp     al, 17
+        je      mono_17_entry
+        cmp     al, 18
+        je      mono_18_entry
+
+;
+; registers during default term processing loop:
+;   rdi         active buffer pointer
+;   rsi         end of buffer pointer
+;   r8d         delta
+;   ecx         weight_A
+;   ebx         term * -4
+;   eax,edx     scratch
+;
+
+default_mono_entry:                         ; every other term value falls through to here
+        imul    rbx, rax, -4                ; set rbx to term * -4 for decorrelation index
+        mov     rdx, [rsp]                  ; rdx = dpp*
+        mov     ecx, [rdx+8]                ; ecx = weight, r8d = delta
+        mov     r8d, [rdx+4]
+        jmp     default_mono_loop
+
+;
+; registers during processing loop for terms 17 & 18:
+;   rdi         active buffer pointer
+;   rsi         end of buffer pointer
+;   r8d         delta
+;   ecx         weight_A
+;   ebp         previously calculated value
+;   ebx         calculated correlation sample
+;   eax,edx     scratch
+;
+
+mono_17_entry:
+        mov     rdx, [rsp]                  ; rdx = dpp*
+        mov     ecx, [rdx+8]                ; ecx = weight, r8d = delta
+        mov     r8d, [rdx+4]
+        mov     ebp, [rdi-4]                ; ebp = bptr [-1], the previously calculated value
+        jmp     mono_17_loop
+
+mono_18_entry:
+        mov     rdx, [rsp]                  ; rdx = dpp*
+        mov     ecx, [rdx+8]                ; ecx = weight, r8d = delta
+        mov     r8d, [rdx+4]
+        mov     ebp, [rdi-4]                ; ebp = bptr [-1], the previously calculated value
+        jmp     mono_18_loop
+
+        align  64
+default_mono_loop:
+        mov     eax, [rdi+rbx]              ; eax = correlation sample at bptr [-term]
+        imul    eax, ecx                    ; 32-bit product with weight; OF set if it does not fit
+        mov     edx, [rdi]                  ; edx = value to decorrelate (mov leaves flags intact)
+        jo      long_default_mono_loop      ; overflow: nothing stored yet, redo this value in the 64-bit loop
+        sar     eax, 10                     ; scale by 1/1024; CF = last bit shifted out
+        adc     eax, edx                    ; add input plus CF (rounds the scaled product)
+        mov     [rdi], eax                  ; store result in place
+        mov     eax, [rdi+rbx]              ; reload correlation sample for the weight update
+        add     rdi, 4                      ; advance bptr by one sample
+        test    edx, edx
+        je      L100                        ; no weight update if the input value was zero
+        test    eax, eax
+        je      L100                        ; ... or if the correlation sample was zero
+        xor     eax, edx
+        cdq                                 ; edx = 0 or -1: sign (sam ^ input)
+        xor     ecx, edx                    ; complement weight when sign is negative
+        add     ecx, r8d                    ; add delta
+        xor     ecx, edx                    ; complement back (net effect there: weight -= delta)
+L100:   cmp     rdi, rsi                    ; compare bptr and eptr to see if we're done
+        jb      default_mono_loop
+        jmp     default_mono_done
+
+        align  64
+long_default_mono_loop:
+        mov     eax, [rdi+rbx]              ; eax = correlation sample at bptr [-term]
+        imul    ecx                         ; edx:eax = eax * weight (64-bit signed product)
+        shl     edx, 22                     ; low 10 bits of the high dword move to bits 22-31
+        shr     eax, 10                     ; low dword >> 10; CF = last bit shifted out
+        adc     eax, edx                    ; eax = low 32 bits of (product >> 10), plus rounding carry
+        mov     edx, [rdi]                  ; edx = value to decorrelate
+        add     eax, edx
+        mov     [rdi], eax                  ; store result in place
+        mov     eax, [rdi+rbx]              ; reload correlation sample for the weight update
+        add     rdi, 4                      ; advance bptr by one sample
+        test    edx, edx
+        je      L101
+        test    eax, eax
+        je      L101
+        xor     eax, edx
+        cdq                                 ; edx = 0 or -1: sign (sam ^ input)
+        xor     ecx, edx                    ; same weight update as in the short loop
+        add     ecx, r8d
+        xor     ecx, edx
+L101:   cmp     rdi, rsi                    ; compare bptr and eptr to see if we're done
+        jb      long_default_mono_loop      ; once entered, the long loop is used to the end of the buffer
+
+default_mono_done:
+        mov     rdx, [rsp]                  ; rdx = dpp*
+        mov     [rdx+8], ecx                ; store weight_A back
+        mov     ecx, [rdx]                  ; ecx = dpp->term
+
+default_mono_store_samples:
+        dec     ecx                         ; ecx = history slot to fill (term-1 down to 0)
+        sub     rdi, 4                      ; back up one full sample
+        mov     eax, [rdi]
+        mov     [rdx+rcx*4+16], eax         ; store samples_A [ecx]
+        test    ecx, ecx
+        jnz     default_mono_store_samples  ; repeat until slot 0 has been written
+        jmp     mono_done
+
+        align  64
+mono_17_loop:
+        lea     ebx, [ebp+ebp]              ; ebx = 2 * previous value
+        sub     ebx, [rdi-8]                ; ebx = 2 * bptr [-1] - bptr [-2] (correlation sample)
+        mov     eax, ecx
+        imul    eax, ebx                    ; 32-bit product with weight; OF set if it does not fit
+        mov     edx, [rdi]                  ; edx = value to decorrelate (mov leaves flags intact)
+        jo      long_mono_17_loop           ; overflow: nothing stored yet, redo this value in the 64-bit loop
+        sar     eax, 10                     ; scale by 1/1024; CF = last bit shifted out
+        adc     eax, edx                    ; add input plus CF (rounds the scaled product)
+        stosd                               ; store result, rdi += 4
+        test    ebx, ebx
+        mov     ebp, eax                    ; remember result as previous value (mov keeps the flags from test)
+        je      L117                        ; no weight update if correlation sample was zero
+        test    edx, edx
+        je      L117                        ; ... or if the input value was zero
+        xor     ebx, edx
+        sar     ebx, 31                     ; ebx = 0 or -1: sign (sam ^ input)
+        xor     ecx, ebx                    ; complement weight when sign is negative
+        add     ecx, r8d                    ; add delta
+        xor     ecx, ebx                    ; complement back (net effect there: weight -= delta)
+L117:   cmp     rdi, rsi                    ; compare bptr and eptr to see if we're done
+        jb      mono_17_loop
+        jmp     mono_1718_exit
+
+        align  64
+long_mono_17_loop:
+        lea     ebx, [ebp+ebp]              ; ebx = 2 * previous value
+        sub     ebx, [rdi-8]                ; ebx = 2 * bptr [-1] - bptr [-2] (correlation sample)
+        mov     eax, ecx
+        imul    ebx                         ; edx:eax = weight * sam (64-bit signed product)
+        shl     edx, 22                     ; low 10 bits of the high dword move to bits 22-31
+        shr     eax, 10                     ; low dword >> 10; CF = last bit shifted out
+        adc     eax, edx                    ; eax = low 32 bits of (product >> 10), plus rounding carry
+        mov     edx, [rdi]                  ; edx = value to decorrelate
+        add     eax, edx
+        stosd                               ; store result, rdi += 4
+        test    ebx, ebx
+        mov     ebp, eax                    ; remember result as previous value (mov keeps the flags from test)
+        je      L217
+        test    edx, edx
+        je      L217
+        xor     ebx, edx
+        sar     ebx, 31
+        xor     ecx, ebx                    ; same weight update as in the short loop
+        add     ecx, r8d
+        xor     ecx, ebx
+L217:   cmp     rdi, rsi                    ; compare bptr and eptr to see if we're done
+        jb      long_mono_17_loop           ; once entered, the long loop is used to the end of the buffer
+        jmp     mono_1718_exit
+
+        align  64
+mono_18_loop:
+        lea     ebx, [ebp+ebp*2]            ; ebx = 3 * previous value
+        sub     ebx, [rdi-8]                ; ebx = 3 * bptr [-1] - bptr [-2]
+        sar     ebx, 1                      ; halve (arithmetic shift) to get the correlation sample
+        mov     eax, ecx
+        imul    eax, ebx                    ; 32-bit product with weight; OF set if it does not fit
+        mov     edx, [rdi]                  ; edx = value to decorrelate (mov leaves flags intact)
+        jo      long_mono_18_loop           ; overflow: nothing stored yet, redo this value in the 64-bit loop
+        sar     eax, 10                     ; scale by 1/1024; CF = last bit shifted out
+        adc     eax, edx                    ; add input plus CF (rounds the scaled product)
+        stosd                               ; store result, rdi += 4
+        test    ebx, ebx
+        mov     ebp, eax                    ; remember result as previous value (mov keeps the flags from test)
+        je      L118
+        test    edx, edx
+        je      L118
+        xor     ebx, edx
+        sar     ebx, 31                     ; ebx = 0 or -1: sign (sam ^ input)
+        xor     ecx, ebx                    ; complement weight when sign is negative
+        add     ecx, r8d                    ; add delta
+        xor     ecx, ebx                    ; complement back (net effect there: weight -= delta)
+L118:   cmp     rdi, rsi                    ; compare bptr and eptr to see if we're done
+        jb      mono_18_loop
+        jmp     mono_1718_exit
+
+        align  64
+long_mono_18_loop:
+        lea     ebx, [ebp+ebp*2]            ; ebx = 3 * previous value
+        sub     ebx, [rdi-8]                ; ebx = 3 * bptr [-1] - bptr [-2]
+        sar     ebx, 1                      ; halve (arithmetic shift) to get the correlation sample
+        mov     eax, ecx
+        imul    ebx                         ; edx:eax = weight * sam (64-bit signed product)
+        shl     edx, 22                     ; low 10 bits of the high dword move to bits 22-31
+        shr     eax, 10                     ; low dword >> 10; CF = last bit shifted out
+        adc     eax, edx                    ; eax = low 32 bits of (product >> 10), plus rounding carry
+        mov     edx, [rdi]                  ; edx = value to decorrelate
+        add     eax, edx
+        stosd                               ; store result, rdi += 4
+        test    ebx, ebx
+        mov     ebp, eax                    ; remember result as previous value (mov keeps the flags from test)
+        je      L218
+        test    edx, edx
+        je      L218
+        xor     ebx, edx
+        sar     ebx, 31
+        xor     ecx, ebx                    ; same weight update as in the short loop
+        add     ecx, r8d
+        xor     ecx, ebx
+L218:   cmp     rdi, rsi                    ; compare bptr and eptr to see if we're done
+        jb      long_mono_18_loop           ; once entered, the long loop is used to the end of the buffer
+
+mono_1718_exit:
+        mov     rdx, [rsp]                  ; rdx = dpp*
+        mov     [rdx+8], ecx                ; store weight_A back
+        mov     eax, [rdi-4]                ; dpp->samples_A [0] = bptr [-1];
+        mov     [rdx+16], eax
+        mov     eax, [rdi-8]                ; dpp->samples_A [1] = bptr [-2];
+        mov     [rdx+20], eax
+
+mono_done:
+        add     rsp, 8                      ; begin epilog by deallocating stack
+        pop     rsi                         ; restore non-volatile registers & return
+        pop     rdi
+        pop     rbx
+        pop     rbp
+        ret
+
+unpack_decorr_mono_pass_cont_x64win endp
+
+asmcode ends
+
+        end
+
+
diff --git a/third_party/wavpack/src/unpack_x86.S b/third_party/wavpack/src/unpack_x86.S
new file mode 100644
index 0000000..104515b
--- /dev/null
+++ b/third_party/wavpack/src/unpack_x86.S
@@ -0,0 +1,970 @@
+############################################################################
+##                           **** WAVPACK ****                            ##
+##                  Hybrid Lossless Wavefile Compressor                   ##
+##              Copyright (c) 1998 - 2015 Conifer Software.               ##
+##                          All Rights Reserved.                          ##
+##      Distributed under the BSD Software License (see license.txt)      ##
+############################################################################
+
+        .intel_syntax noprefix
+        .text
+
+        .globl  _unpack_decorr_stereo_pass_cont_x86
+        .globl  _unpack_decorr_mono_pass_cont_x86
+        .globl  _unpack_cpu_has_feature_x86
+
+        .globl  unpack_decorr_stereo_pass_cont_x86
+        .globl  unpack_decorr_mono_pass_cont_x86
+        .globl  unpack_cpu_has_feature_x86
+
+
+# This module contains X86 assembly optimized versions of functions required
+# to decode WavPack files. Note that the stereo versions of these functions
+# use the MMX registers and instructions of the X86 processor, and so a
+# helper function is provided to make a runtime check for that feature.
+
+# This is an assembly optimized version of the following WavPack function:
+#
+# void unpack_decorr_stereo_pass_cont (struct decorr_pass *dpp,
+#                                      int32_t *buffer,
+#                                      int32_t sample_count,
+#                                      int32_t long_math);
+#
+# It performs a single pass of stereo decorrelation on the provided buffer.
+# Note that this version of the function requires that up to 8 previous
+# stereo samples are visible and correct. In other words, it ignores the
+# "samples_*" fields in the decorr_pass structure and gets the history data
+# directly from the buffer. It does, however, return the appropriate history
+# samples to the decorr_pass structure before returning.
+#
+# The "long_math" argument is used to specify that a 32-bit multiply is
+# not enough for the "apply_weight" operation (although in this case it
+# would only apply to the -1 and -2 terms because the MMX code does not have
+# this limitation) but we ignore the parameter and use the overflow detection
+# of the "imul" instruction to switch automatically to the "long_math" loop.
+#
+# This is written to work on an IA-32 processor and uses the MMX extensions
+# to improve the performance by processing both stereo channels together.
+# For terms -1 and -2 the MMX extensions are not usable, and so these are
+# performed independently without them.
+#
+# arguments on entry:
+#
+#   struct decorr_pass *dpp     [ebp+8]
+#   int32_t *buffer             [ebp+12]
+#   int32_t sample_count        [ebp+16]
+#   int32_t long_math           [ebp+20]
+#
+# registers after entry:
+#
+#   edi         bptr
+#   esi         eptr
+#
+# on stack (used for terms -1 and -2 only):
+# 
+#   int32_t delta             DWORD [esp]
+#
+
+_unpack_decorr_stereo_pass_cont_x86:
+unpack_decorr_stereo_pass_cont_x86:
+        push    ebp                         # build frame: arguments start at [ebp+8]
+        mov     ebp, esp
+        push    ebx                         # save registers that are popped again at done
+        push    esi
+        push    edi
+
+        mov     edx, [ebp+8]                # copy delta from dpp to top of stack
+        mov     eax, [edx+4]
+        push    eax
+
+        mov     edi, [ebp+12]               # edi = buffer
+        mov     eax, [ebp+16]               # get sample_count and multiply by 8
+        shl     eax, 3                      # (byte count; the loops advance edi by 8 per sample)
+        jz      done                        # exit now if there's nothing to do
+
+        add     eax, edi                    # else add to buffer pointer to make eptr
+        mov     esi, eax
+    
+        mov     eax, [ebp+8]                # get term from dpp and vector appropriately
+        mov     eax, [eax]
+        cmp     eax, 17
+        je      term_17_entry
+        cmp     eax, 18
+        je      term_18_entry
+        cmp     eax, -1
+        je      term_minus_1_entry
+        cmp     eax, -2
+        je      term_minus_2_entry
+        cmp     eax, -3
+        je      term_minus_3_entry          # any other term falls through to default_term_entry
+
+#
+# registers during default term processing loop:
+#   edi         active buffer pointer
+#   esi         end of buffer pointer
+#
+# MMX:
+#   mm0, mm1    scratch
+#   mm2         original sample values
+#   mm3         correlation samples
+#   mm4         zero (for pcmpeqd)
+#   mm5         weights
+#   mm6         delta
+#   mm7         512 (for rounding)
+#
+
+default_term_entry:
+        imul    ebx, eax, -8                # set ebx to term * -8 for decorrelation index
+        mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  # mm7 = round (512)
+        mov     edx, [ebp+8]                # edx = dpp (pointer to decorr_pass)
+        mov     eax, [edx+4]                # eax = delta (dpp offset 4)
+        movd    mm6, eax
+        punpckldq mm6, mm6                  # mm6 = delta (0-7)
+        mov     eax, 0xFFFF                 # mask high weights to zero for PMADDWD
+        movd    mm5, eax
+        punpckldq mm5, mm5                  # mm5 = weight mask 0x0000FFFF0000FFFF
+        pand    mm5, [edx+8]                # mm5 = weight_AB masked to 16 bits
+        pxor    mm4, mm4                    # mm4 = zero (for pcmpeqd)
+        jmp     default_term_loop
+
+        .balign  64
+default_term_loop:
+        movq    mm3, [edi+ebx]              # mm3 = sam_AB
+        movq    mm1, mm3                    # mm1 and mm0 = working copies of sam_AB
+        movq    mm0, mm3
+        paddd   mm1, mm1
+        psrld   mm0, 15                     # mm0 = high part of sam_AB (bits 15 and up)
+        psrlw   mm1, 1                      # mm1 low words = low 15 bits of sam_AB
+        pmaddwd mm0, mm5                    # high part * weight
+        pmaddwd mm1, mm5                    # low part * weight
+        movq    mm2, [edi]                  # mm2 = left_right
+        pslld   mm0, 5                      # high product << 15 >> 10
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        paddd   mm0, mm2
+        paddd   mm0, mm1                    # add shifted sums
+        movq    [edi], mm0                  # store result
+        movq    mm0, mm3
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        add     edi, 8
+        pcmpeqd mm2, mm4                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm4                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pxor    mm5, mm0                    # complement weights where signs differ
+        paddw   mm5, mm2                    # and add to weight_AB
+        pxor    mm5, mm0                    # undo the complement
+        cmp     edi, esi                    # compare bptr and eptr to see if we're done
+        jb      default_term_loop
+
+        pslld   mm5, 16                     # sign-extend 16-bit weights back to dwords
+        psrad   mm5, 16
+        mov     eax, [ebp+8]                # point to dpp
+        movq    [eax+8], mm5                # put weight_AB back
+        emms
+        mov     edx, [ebp+8]                # access dpp with edx
+        mov     ecx, [edx]                  # ecx = dpp->term
+
+default_store_samples:
+        dec     ecx                         # ecx counts down from term - 1 to 0
+        sub     edi, 8                      # back up one full sample
+        mov     eax, [edi+4]
+        mov     [edx+ecx*4+48], eax         # store samples_B [ecx]
+        mov     eax, [edi]
+        mov     [edx+ecx*4+16], eax         # store samples_A [ecx]
+        test    ecx, ecx
+        jnz     default_store_samples       # repeat until index 0 has been stored
+
+        jmp     done
+
+#
+# registers during processing loop for terms 17 & 18:
+#   edi         active buffer pointer
+#   esi         end of buffer pointer
+#
+# MMX:
+#   mm0, mm1    scratch
+#   mm2         original sample values
+#   mm3         calculated correlation samples
+#   mm4         last calculated values (so we don't need to reload)
+#   mm5         weights
+#   mm6         delta
+#   mm7         512 (for rounding)
+#
+
+term_17_entry:
+        mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  # mm7 = round (512)
+        mov     edx, [ebp+8]                # point to dpp & get delta
+        mov     eax, [edx+4]
+        movd    mm6, eax
+        punpckldq mm6, mm6                  # mm6 = delta (0-7)
+        mov     eax, 0xFFFF                 # mask high weights to zero for PMADDWD
+        movd    mm5, eax
+        punpckldq mm5, mm5                  # mm5 = weight mask 0x0000FFFF0000FFFF
+        pand    mm5, [edx+8]                # mm5 = weight_AB masked to 16 bits
+        movq    mm4, [edi-8]                # preload previous calculated values
+        jmp     term_17_loop
+
+        .balign  64
+term_17_loop:
+        paddd   mm4, mm4                    # mm4 = 2 * previous values
+        psubd   mm4, [edi-16]               # mm4 = sam_AB = 2 * previous - the one before
+        movq    mm3, mm4                    # mm3 = sam_AB (kept for the weight update)
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrld   mm4, 15                     # mm4 = high part of sam_AB (bits 15 and up)
+        psrlw   mm1, 1                      # mm1 low words = low 15 bits of sam_AB
+        pmaddwd mm4, mm5                    # high part * weight
+        pmaddwd mm1, mm5                    # low part * weight
+        movq    mm2, [edi]                  # mm2 = left_right
+        pslld   mm4, 5                      # high product << 15 >> 10
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        paddd   mm4, mm2
+        paddd   mm4, mm1                    # add shifted sums
+        movq    mm0, mm3
+        movq    [edi], mm4                  # store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        add     edi, 8
+        pxor    mm1, mm1                    # mm1 = zero
+        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pxor    mm5, mm0                    # complement weights where signs differ
+        paddw   mm5, mm2                    # and add to weight_AB
+        pxor    mm5, mm0                    # undo the complement
+        cmp     edi, esi                    # compare bptr and eptr to see if we're done
+        jb      term_17_loop
+
+        pslld   mm5, 16                     # sign-extend 16-bit weights back to dwords
+        psrad   mm5, 16
+        mov     eax, [ebp+8]                # point to dpp
+        movq    [eax+8], mm5                # put weight_AB back
+        emms
+        jmp     term_1718_exit
+
+term_18_entry:
+        mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  # mm7 = round (512)
+        mov     edx, [ebp+8]                # point to dpp & get delta
+        mov     eax, [edx+4]
+        movd    mm6, eax
+        punpckldq mm6, mm6                  # mm6 = delta (0-7)
+        mov     eax, 0xFFFF                 # mask high weights to zero for PMADDWD
+        movd    mm5, eax
+        punpckldq mm5, mm5                  # mm5 = weight mask 0x0000FFFF0000FFFF
+        pand    mm5, [edx+8]                # mm5 = weight_AB masked to 16 bits
+        movq    mm4, [edi-8]                # preload previous calculated value
+        jmp     term_18_loop
+
+        .balign  64
+term_18_loop:
+        movq    mm3, mm4
+        psubd   mm3, [edi-16]               # previous - the one before
+        psrad   mm3, 1                      # halved (arithmetic shift)
+        paddd   mm3, mm4                    # mm3 = sam_AB
+        movq    mm1, mm3
+        movq    mm4, mm3
+        paddd   mm1, mm1
+        psrld   mm4, 15                     # mm4 = high part of sam_AB (bits 15 and up)
+        psrlw   mm1, 1                      # mm1 low words = low 15 bits of sam_AB
+        pmaddwd mm4, mm5                    # high part * weight
+        pmaddwd mm1, mm5                    # low part * weight
+        movq    mm2, [edi]                  # mm2 = left_right
+        pslld   mm4, 5                      # high product << 15 >> 10
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        paddd   mm4, mm2
+        paddd   mm4, mm1                    # add shifted sums
+        movq    mm0, mm3
+        movq    [edi], mm4                  # store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        add     edi, 8
+        pxor    mm1, mm1                    # mm1 = zero
+        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pxor    mm5, mm0                    # complement weights where signs differ
+        paddw   mm5, mm2                    # and add to weight_AB
+        pxor    mm5, mm0                    # undo the complement
+        cmp     edi, esi                    # compare bptr and eptr to see if we're done
+        jb      term_18_loop
+
+        pslld   mm5, 16                     # sign-extend 16-bit weights back to dwords
+        psrad   mm5, 16
+        mov     eax, [ebp+8]                # point to dpp
+        movq    [eax+8], mm5                # put weight_AB back
+        emms
+
+term_1718_exit:
+        mov     edx, [edi-4]                # dpp->samples_B [0] = bptr [-1];
+        mov     eax, [ebp+8]                # eax = dpp
+        mov     [eax+48], edx
+        mov     edx, [edi-8]                # dpp->samples_A [0] = bptr [-2];
+        mov     [eax+16], edx
+        mov     edx, [edi-12]               # dpp->samples_B [1] = bptr [-3];
+        mov     [eax+52], edx
+        mov     edx, [edi-16]               # dpp->samples_A [1] = bptr [-4];
+        mov     [eax+20], edx
+        jmp     done
+
+#
+# registers in term -1 & -2 loops:
+#
+#   eax,ebx,edx scratch
+#   ecx         weight_A
+#   ebp         weight_B
+#   edi         bptr
+#   esi         eptr
+#
+
+term_minus_1_entry:
+        cld                                 # we use stosd here...
+        mov     eax, [ebp+8]                # point to dpp
+        mov     ecx, [eax+8]                # ecx = weight_A and ebp = weight_B
+        mov     ebp, [eax+12]               # (frame pointer is lost; delta is read via [esp])
+        mov     eax, [edi-4]                # eax = bptr [-1], the first correlation sample
+        jmp     term_minus_1_loop
+
+        .balign  64
+term_minus_1_loop:
+        mov     ebx, eax                    # ebx = correlation sample (kept for weight update)
+        imul    eax, ecx                    # eax = sam * weight_A (32-bit product)
+        mov     edx, [edi]                  # edx = original sample (mov keeps imul's flags)
+        jo      OV11                        # product overflowed: switch to the long loop
+        sar     eax, 10                     # scale down; CF = last bit shifted out
+        adc     eax, edx                    # add original sample plus CF (rounding)
+        stosd                               # store result at [edi], edi += 4
+        test    ebx, ebx                    # no weight update if either value is zero
+        je      L182
+        test    edx, edx
+        je      L182
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = -1 if signs differ, else 0
+        xor     ecx, ebx                    # conditionally complement weight_A
+        add     ecx, [esp]                  # add delta
+        mov     edx, 1024
+        add     edx, ebx                    # edx = limit (1024, or 1023 when complemented)
+        cmp     ecx, edx
+        jle     L183
+        mov     ecx, edx                    # clamp to limit
+L183:   xor     ecx, ebx                    # undo the complement
+L182:   mov     ebx, eax                    # value just stored is the next correlation sample
+        imul    eax, ebp                    # eax = sam * weight_B (32-bit product)
+        mov     edx, [edi]                  # edx = original sample (mov keeps imul's flags)
+        jo      OV12                        # product overflowed: switch to the long loop
+        sar     eax, 10                     # scale down; CF = last bit shifted out
+        adc     eax, edx                    # add original sample plus CF (rounding)
+        stosd                               # store result at [edi], edi += 4
+        test    ebx, ebx                    # no weight update if either value is zero
+        je      L189
+        test    edx, edx
+        je      L189
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = -1 if signs differ, else 0
+        xor     ebp, ebx                    # conditionally complement weight_B
+        add     ebp, [esp]                  # add delta
+        mov     edx, 1024
+        add     edx, ebx                    # edx = limit (1024, or 1023 when complemented)
+        cmp     ebp, edx
+        jle     L188
+        mov     ebp, edx                    # clamp to limit
+L188:   xor     ebp, ebx                    # undo the complement
+L189:   cmp     edi, esi                    # compare bptr and eptr to see if we're done
+        jb      term_minus_1_loop
+        jmp     term_minus_1_done
+
+OV11:   mov     eax, ebx                    # restore previous sample into eax
+        jmp     long_term_minus_1_loop
+
+OV12:   mov     eax, ebx                    # restore previous sample into eax
+        jmp     L282
+
+        .balign  64
+long_term_minus_1_loop:
+        mov     ebx, eax                    # ebx = correlation sample (kept for weight update)
+        imul    ecx                         # edx:eax = sam * weight_A (64-bit product)
+        shl     edx, 22                     # high dword contributes bits 22 and up
+        shr     eax, 10                     # low dword >> 10; CF = last bit shifted out
+        adc     eax, edx                    # eax = (product >> 10) with rounding
+        mov     edx, [edi]                  # edx = original sample
+        add     eax, edx
+        stosd                               # store result at [edi], edi += 4
+        test    ebx, ebx                    # no weight update if either value is zero
+        je      L282
+        test    edx, edx
+        je      L282
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = -1 if signs differ, else 0
+        xor     ecx, ebx                    # conditionally complement weight_A
+        add     ecx, [esp]                  # add delta
+        mov     edx, 1024
+        add     edx, ebx                    # edx = limit (1024, or 1023 when complemented)
+        cmp     ecx, edx
+        jle     L283
+        mov     ecx, edx                    # clamp to limit
+L283:   xor     ecx, ebx                    # undo the complement
+L282:   mov     ebx, eax                    # value just stored is the next correlation sample
+        imul    ebp                         # edx:eax = sam * weight_B (64-bit product)
+        shl     edx, 22                     # high dword contributes bits 22 and up
+        shr     eax, 10                     # low dword >> 10; CF = last bit shifted out
+        adc     eax, edx                    # eax = (product >> 10) with rounding
+        mov     edx, [edi]                  # edx = original sample
+        add     eax, edx
+        stosd                               # store result at [edi], edi += 4
+        test    ebx, ebx                    # no weight update if either value is zero
+        je      L289
+        test    edx, edx
+        je      L289
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = -1 if signs differ, else 0
+        xor     ebp, ebx                    # conditionally complement weight_B
+        add     ebp, [esp]                  # add delta
+        mov     edx, 1024
+        add     edx, ebx                    # edx = limit (1024, or 1023 when complemented)
+        cmp     ebp, edx
+        jle     L288
+        mov     ebp, edx                    # clamp to limit
+L288:   xor     ebp, ebx                    # undo the complement
+L289:   cmp     edi, esi                    # compare bptr and eptr to see if we're done
+        jb      long_term_minus_1_loop
+
+term_minus_1_done:
+        mov     edx, ebp                    # edx = weight_B (ebp is about to be restored)
+        mov     ebp, esp                    # restore ebp (we've pushed 4 DWORDS)
+        add     ebp, 16
+        mov     eax, [ebp+8]                # point to dpp
+        mov     [eax+8], ecx                # store weight_A back
+        mov     [eax+12], edx               # store weight_B back
+        mov     edx, [edi-4]                # dpp->samples_A [0] = bptr [-1]
+        mov     [eax+16], edx
+        jmp     done
+
+
+term_minus_2_entry:
+        mov     eax, [ebp+8]                # point to dpp
+        mov     ecx, [eax+8]                # ecx = weight_A and ebp = weight_B
+        mov     ebp, [eax+12]               # (frame pointer is lost; delta is read via [esp])
+        mov     eax, [edi-8]                # eax = bptr [-2], the first correlation sample
+        jmp     term_minus_2_loop
+
+        .balign  64
+term_minus_2_loop:
+        mov     ebx, eax                    # ebx = correlation sample (kept for weight update)
+        imul    eax, ebp                    # eax = sam * weight_B (32-bit product)
+        mov     edx, [edi+4]                # edx = original sample (mov keeps imul's flags)
+        jo      OV21                        # product overflowed: switch to the long loop
+        sar     eax, 10                     # scale down; CF = last bit shifted out
+        adc     eax, edx                    # add original sample plus CF (rounding)
+        mov     [edi+4], eax                # store result
+        test    ebx, ebx                    # no weight update if either value is zero
+        je      L194
+        test    edx, edx
+        je      L194
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = -1 if signs differ, else 0
+        xor     ebp, ebx                    # conditionally complement weight_B
+        add     ebp, [esp]                  # add delta
+        mov     edx, 1024
+        add     edx, ebx                    # edx = limit (1024, or 1023 when complemented)
+        cmp     ebp, edx
+        jle     L195
+        mov     ebp, edx                    # clamp to limit
+L195:   xor     ebp, ebx                    # undo the complement
+L194:   mov     ebx, eax                    # value just stored is the next correlation sample
+        imul    eax, ecx                    # eax = sam * weight_A (32-bit product)
+        mov     edx, [edi]                  # edx = original sample (mov keeps imul's flags)
+        jo      OV22                        # product overflowed: switch to the long loop
+        sar     eax, 10                     # scale down; CF = last bit shifted out
+        adc     eax, edx                    # add original sample plus CF (rounding)
+        mov     [edi], eax                  # store result
+        add     edi, 8                      # advance to the next stereo sample
+        test    ebx, ebx                    # no weight update if either value is zero
+        je      L201
+        test    edx, edx
+        je      L201
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = -1 if signs differ, else 0
+        xor     ecx, ebx                    # conditionally complement weight_A
+        add     ecx, [esp]                  # add delta
+        mov     edx, 1024
+        add     edx, ebx                    # edx = limit (1024, or 1023 when complemented)
+        cmp     ecx, edx
+        jle     L200
+        mov     ecx, edx                    # clamp to limit
+L200:   xor     ecx, ebx                    # undo the complement
+L201:   cmp     edi, esi                    # compare bptr and eptr to see if we're done
+        jb      term_minus_2_loop
+        jmp     term_minus_2_done
+
+OV21:   mov     eax, ebx                    # restore previous sample into eax
+        jmp     long_term_minus_2_loop
+
+OV22:   mov     eax, ebx                    # restore previous sample into eax
+        jmp     L294
+
+        .balign  64
+long_term_minus_2_loop:
+        mov     ebx, eax                    # ebx = correlation sample (kept for weight update)
+        imul    ebp                         # edx:eax = sam * weight_B (64-bit product)
+        shl     edx, 22                     # high dword contributes bits 22 and up
+        shr     eax, 10                     # low dword >> 10; CF = last bit shifted out
+        adc     eax, edx                    # eax = (product >> 10) with rounding
+        mov     edx, [edi+4]                # edx = original sample
+        add     eax, edx
+        mov     [edi+4], eax                # store result
+        test    ebx, ebx                    # no weight update if either value is zero
+        je      L294
+        test    edx, edx
+        je      L294
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = -1 if signs differ, else 0
+        xor     ebp, ebx                    # conditionally complement weight_B
+        add     ebp, [esp]                  # add delta
+        mov     edx, 1024
+        add     edx, ebx                    # edx = limit (1024, or 1023 when complemented)
+        cmp     ebp, edx
+        jle     L295
+        mov     ebp, edx                    # clamp to limit
+L295:   xor     ebp, ebx                    # undo the complement
+L294:   mov     ebx, eax                    # value just stored is the next correlation sample
+        imul    ecx                         # edx:eax = sam * weight_A (64-bit product)
+        shl     edx, 22                     # high dword contributes bits 22 and up
+        shr     eax, 10                     # low dword >> 10; CF = last bit shifted out
+        adc     eax, edx                    # eax = (product >> 10) with rounding
+        mov     edx, [edi]                  # edx = original sample
+        add     eax, edx
+        mov     [edi], eax                  # store result
+        add     edi, 8                      # advance to the next stereo sample
+        test    ebx, ebx                    # no weight update if either value is zero
+        je      L301
+        test    edx, edx
+        je      L301
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = -1 if signs differ, else 0
+        xor     ecx, ebx                    # conditionally complement weight_A
+        add     ecx, [esp]                  # add delta
+        mov     edx, 1024
+        add     edx, ebx                    # edx = limit (1024, or 1023 when complemented)
+        cmp     ecx, edx
+        jle     L300
+        mov     ecx, edx                    # clamp to limit
+L300:   xor     ecx, ebx                    # undo the complement
+L301:   cmp     edi, esi                    # compare bptr and eptr to see if we're done
+        jb      long_term_minus_2_loop
+
+term_minus_2_done:
+        mov     edx, ebp                    # edx = weight_B (ebp is about to be restored)
+        lea     ebp, [esp+16]               # restore ebp (we've pushed 4 DWORDS)
+        mov     eax, [ebp+8]                # point to dpp
+        mov     [eax+8], ecx                # store weight_A back
+        mov     [eax+12], edx               # store weight_B back
+        mov     edx, [edi-8]                # dpp->samples_B [0] = bptr [-2];
+        mov     [eax+48], edx
+        jmp     done
+
+#
+# registers during processing loop for term -3:
+#   edi         active buffer pointer
+#   esi         end of buffer pointer
+#
+# MMX:
+#   mm0, mm1    scratch
+#   mm2         original sample values
+#   mm3         calculated correlation samples
+#   mm4         last calculated values (so we don't need to reload)
+#   mm5         weights
+#   mm6         delta
+#   mm7         512 (for rounding)
+#
+
+term_minus_3_entry:
+        mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  # mm7 = round (512)
+        mov     edx, [ebp+8]                # point to dpp & get delta
+        mov     eax, [edx+4]
+        movd    mm6, eax
+        punpckldq mm6, mm6                  # mm6 = delta (0-7)
+        mov     eax, 0xFFFF                 # mask high weights to zero for PMADDWD
+        movd    mm5, eax
+        punpckldq mm5, mm5                  # mm5 = weight mask 0x0000FFFF0000FFFF
+        pand    mm5, [edx+8]                # mm5 = weight_AB masked to 16 bits
+        movq    mm4, [edi-8]                # preload previous calculated values
+        jmp     term_minus_3_loop
+
+        .balign  64
+term_minus_3_loop:
+        movq    mm3, mm4                    # mm3 = swap dwords (mm4)
+        psrlq   mm3, 32
+        punpckldq mm3, mm4                  # mm3 = sam_AB
+        movq    mm1, mm3
+        movq    mm4, mm3
+        pslld   mm1, 1
+        psrld   mm4, 15                     # mm4 = high part of sam_AB (bits 15 and up)
+        psrlw   mm1, 1                      # mm1 low words = low 15 bits of sam_AB
+        pmaddwd mm4, mm5                    # high part * weight
+        pmaddwd mm1, mm5                    # low part * weight
+        movq    mm2, [edi]                  # mm2 = left_right
+        pslld   mm4, 5                      # high product << 15 >> 10
+        paddd   mm1, mm7                    # add 512 for rounding
+        psrad   mm1, 10
+        paddd   mm4, mm2
+        paddd   mm4, mm1                    # add shifted sums
+        movq    [edi], mm4                  # store result
+        movq    mm0, mm3
+        pxor    mm0, mm2
+        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
+        add     edi, 8
+        pxor    mm1, mm1                    # mm1 = zero
+        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    # mm2 = 1s if either was zero
+        pandn   mm2, mm6                    # mask delta with zeros check
+        pcmpeqd mm1, mm1                    # mm1 = all ones (-1 in each dword)
+        psubd   mm1, mm7                    # two subtractions of 512 ...
+        psubd   mm1, mm7                    # ... leave mm1 = -1025
+        psubd   mm1, mm0                    # mm1 = -1025 - sign (-1025, or -1024 where signs differ)
+        pxor    mm5, mm0                    # complement weights where signs differ
+        paddw   mm5, mm1                    # bias weights so the saturating add below clips them
+        paddusw mm5, mm2                    # and add to weight_AB
+        psubw   mm5, mm1                    # remove the bias again
+        pxor    mm5, mm0                    # undo the complement
+        cmp     edi, esi                    # compare bptr and eptr to see if we're done
+        jb      term_minus_3_loop
+
+        pslld   mm5, 16                     # sign-extend 16-bit weights back to dwords
+        psrad   mm5, 16
+        mov     eax, [ebp+8]                # point to dpp
+        movq    [eax+8], mm5                # put weight_AB back
+        emms
+        mov     edx, [edi-4]                # dpp->samples_A [0] = bptr [-1];
+        mov     eax, [ebp+8]
+        mov     [eax+16], edx
+        mov     edx, [edi-8]                # dpp->samples_B [0] = bptr [-2];
+        mov     [eax+48], edx
+
+done:   pop     eax                         # pop delta & saved regs
+        pop     edi
+        pop     esi
+        pop     ebx
+        pop     ebp
+        ret
+
+#######################################################################################################################
+#
+# This is the mono version of the above function. It does not use MMX and does not handle negative terms.
+#
+# void unpack_decorr_mono_pass_cont (struct decorr_pass *dpp,
+#                                    int32_t *buffer,
+#                                    int32_t sample_count,
+#                                    int32_t long_math);
+# arguments on entry:
+#
+#   struct decorr_pass *dpp     [ebp+8]
+#   int32_t *buffer             [ebp+12]
+#   int32_t sample_count        [ebp+16]
+#   int32_t long_math           [ebp+20]
+#
+# registers after entry:
+#
+#   edi         bptr
+#   esi         eptr
+#
+# on stack:
+#
+#   int32_t delta             DWORD [esp]
+#
+
+_unpack_decorr_mono_pass_cont_x86:
+unpack_decorr_mono_pass_cont_x86:
+        push    ebp                         # build frame: arguments start at [ebp+8]
+        mov     ebp, esp
+        push    ebx                         # save registers that are popped again at mono_done
+        push    esi
+        push    edi
+        cld                                 # the term 17 & 18 loops use stosd
+
+        mov     edx, [ebp+8]                # copy delta from dpp to local stack
+        mov     eax, [edx+4]
+        push    eax
+
+        mov     edi, [ebp+12]               # edi = buffer
+        mov     eax, [ebp+16]               # get sample_count and multiply by 4
+        shl     eax, 2
+        jz      mono_done                   # exit now if there's nothing to do
+        lea     esi, [edi+eax]              # else add to buffer pointer to make eptr
+
+        mov     eax, [ebp+8]                # get term from dpp and vector appropriately
+        mov     eax, [eax]
+        cmp     eax, 17
+        je      mono_17_entry
+        cmp     eax, 18
+        je      mono_18_entry               # any other term falls through to default_mono_entry
+
+#
+# registers during default term processing loop:
+#   edi         active buffer pointer
+#   esi         end of buffer pointer
+#   ecx         weight_A
+#   ebp         frame pointer (left unchanged; read again at default_mono_done)
+#   ebx         term * -4
+#   eax,edx     scratch
+#
+
+default_mono_entry:
+        imul    ebx, eax, -4                # set ebx to term * -4 for decorrelation index
+        mov     edx, [ebp+8]                # edx = dpp*
+        mov     ecx, [edx+8]                # ecx = weight
+        jmp     default_mono_loop
+
+#
+# registers during processing loop for terms 17 & 18:
+#   edi         active buffer pointer
+#   esi         end of buffer pointer
+#   ecx         weight_A
+#   ebp         previously calculated value
+#   ebx         calculated correlation sample
+#   eax,edx     scratch
+#
+
+mono_17_entry:
+        mov     edx, [ebp+8]                # edx = dpp*
+        mov     ecx, [edx+8]                # ecx = weight_A
+        mov     ebp, [edi-4]                # ebp = bptr [-1] (frame pointer is lost until exit)
+        jmp     mono_17_loop
+
+mono_18_entry:
+        mov     edx, [ebp+8]                # edx = dpp*
+        mov     ecx, [edx+8]                # ecx = weight_A
+        mov     ebp, [edi-4]                # ebp = bptr [-1] (frame pointer is lost until exit)
+        jmp     mono_18_loop
+
+        .balign  64
+default_mono_loop:
+        mov     eax, [edi+ebx]              # eax = sam = bptr [-term]
+        imul    eax, ecx                    # eax = sam * weight_A (32-bit product)
+        mov     edx, [edi]                  # edx = original sample (mov keeps imul's flags)
+        jo      long_default_mono_loop      # product overflowed: redo with 64-bit math
+        sar     eax, 10                     # scale down; CF = last bit shifted out
+        adc     eax, edx                    # add original sample plus CF (rounding)
+        mov     [edi], eax                  # store result
+        mov     eax, [edi+ebx]              # reload sam for the weight update
+        add     edi, 4
+        test    edx, edx                    # no weight update if either value is zero
+        je      L100
+        test    eax, eax
+        je      L100
+        xor     eax, edx
+        cdq                                 # edx = -1 if signs differ, else 0
+        xor     ecx, edx                    # conditionally complement weight_A
+        add     ecx, [esp]                  # add delta
+        xor     ecx, edx                    # undo the complement
+L100:   cmp     edi, esi                    # compare bptr and eptr to see if we're done
+        jb      default_mono_loop
+        jmp     default_mono_done
+
+        .balign  64
+long_default_mono_loop:
+        mov     eax, [edi+ebx]              # eax = sam = bptr [-term]
+        imul    ecx                         # edx:eax = sam * weight_A (64-bit product)
+        shl     edx, 22                     # high dword contributes bits 22 and up
+        shr     eax, 10                     # low dword >> 10; CF = last bit shifted out
+        adc     eax, edx                    # eax = (product >> 10) with rounding
+        mov     edx, [edi]                  # edx = original sample
+        add     eax, edx
+        mov     [edi], eax                  # store result
+        mov     eax, [edi+ebx]              # reload sam for the weight update
+        add     edi, 4
+        test    edx, edx                    # no weight update if either value is zero
+        je      L101
+        test    eax, eax
+        je      L101
+        xor     eax, edx
+        cdq                                 # edx = -1 if signs differ, else 0
+        xor     ecx, edx                    # conditionally complement weight_A
+        add     ecx, [esp]                  # add delta
+        xor     ecx, edx                    # undo the complement
+L101:   cmp     edi, esi                    # compare bptr and eptr to see if we're done
+        jb      long_default_mono_loop
+
+default_mono_done:
+        mov     edx, [ebp+8]                # edx = dpp*
+        mov     [edx+8], ecx                # store weight_A back
+        mov     ecx, [edx]                  # ecx = dpp->term
+
+default_mono_store_samples:
+        dec     ecx                         # ecx counts down from term - 1 to 0
+        sub     edi, 4                      # back up one full sample
+        mov     eax, [edi]
+        mov     [edx+ecx*4+16], eax         # store samples_A [ecx]
+        test    ecx, ecx
+        jnz     default_mono_store_samples  # repeat until index 0 has been stored
+        jmp     mono_done
+
+        .balign  64
+mono_17_loop:
+        lea     ebx, [ebp+ebp]              # ebx = 2 * previous value ...
+        sub     ebx, [edi-8]                # ... minus bptr [-2] = correlation sample
+        mov     eax, ecx
+        imul    eax, ebx                    # eax = weight_A * sam (32-bit product)
+        mov     edx, [edi]                  # edx = original sample (mov keeps imul's flags)
+        jo      long_mono_17_loop           # product overflowed: redo with 64-bit math
+        sar     eax, 10                     # scale down; CF = last bit shifted out
+        adc     eax, edx                    # add original sample plus CF (rounding)
+        stosd                               # store result at [edi], edi += 4
+        test    ebx, ebx                    # flags consumed by the je below (mov keeps them)
+        mov     ebp, eax                    # ebp = value just stored (next "previous")
+        je      L117                        # no weight update if sam is zero
+        test    edx, edx
+        je      L117                        # ... or if the original sample is zero
+        mov     eax, [esp]                  # eax = delta
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = -1 if signs differ, else 0
+        xor     eax, ebx
+        sub     eax, ebx                    # eax = -delta if signs differ, else +delta
+        add     ecx, eax                    # update weight_A
+L117:   cmp     edi, esi                    # compare bptr and eptr to see if we're done
+        jb      mono_17_loop
+        jmp     mono_1718_exit
+
+        .balign  64
+long_mono_17_loop:
+        lea     ebx, [ebp+ebp]              # ebx = 2 * previous value ...
+        sub     ebx, [edi-8]                # ... minus bptr [-2] = correlation sample
+        mov     eax, ecx
+        imul    ebx                         # edx:eax = weight_A * sam (64-bit product)
+        shl     edx, 22                     # high dword contributes bits 22 and up
+        shr     eax, 10                     # low dword >> 10; CF = last bit shifted out
+        adc     eax, edx                    # eax = (product >> 10) with rounding
+        mov     edx, [edi]                  # edx = original sample
+        add     eax, edx
+        stosd                               # store result at [edi], edi += 4
+        test    ebx, ebx                    # flags consumed by the je below (mov keeps them)
+        mov     ebp, eax                    # ebp = value just stored (next "previous")
+        je      L217                        # no weight update if sam is zero
+        test    edx, edx
+        je      L217                        # ... or if the original sample is zero
+        mov     eax, [esp]                  # eax = delta
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = -1 if signs differ, else 0
+        xor     eax, ebx
+        sub     eax, ebx                    # eax = -delta if signs differ, else +delta
+        add     ecx, eax                    # update weight_A
+L217:   cmp     edi, esi                    # compare bptr and eptr to see if we're done
+        jb      long_mono_17_loop
+        jmp     mono_1718_exit
+
+        .balign  64
+mono_18_loop:
+        lea     ebx, [ebp+ebp*2]            # ebx = 3 * previous value ...
+        sub     ebx, [edi-8]                # ... minus bptr [-2] ...
+        sar     ebx, 1                      # ... halved = correlation sample
+        mov     eax, ecx
+        imul    eax, ebx                    # eax = weight_A * sam (32-bit product)
+        mov     edx, [edi]                  # edx = original sample (mov keeps imul's flags)
+        jo      long_mono_18_loop           # product overflowed: redo with 64-bit math
+        sar     eax, 10                     # scale down; CF = last bit shifted out
+        adc     eax, edx                    # add original sample plus CF (rounding)
+        stosd                               # store result at [edi], edi += 4
+        test    ebx, ebx                    # flags consumed by the je below (mov keeps them)
+        mov     ebp, eax                    # ebp = value just stored (next "previous")
+        je      L118                        # no weight update if sam is zero
+        test    edx, edx
+        je      L118                        # ... or if the original sample is zero
+        mov     eax, [esp]                  # eax = delta
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = -1 if signs differ, else 0
+        xor     eax, ebx
+        sub     eax, ebx                    # eax = -delta if signs differ, else +delta
+        add     ecx, eax                    # update weight_A
+L118:   cmp     edi, esi                    # compare bptr and eptr to see if we're done
+        jb      mono_18_loop
+        jmp     mono_1718_exit
+
+        .balign  64
+long_mono_18_loop:
+        lea     ebx, [ebp+ebp*2]            # ebx = 3 * previous value ...
+        sub     ebx, [edi-8]                # ... minus bptr [-2] ...
+        sar     ebx, 1                      # ... halved = correlation sample
+        mov     eax, ecx
+        imul    ebx                         # edx:eax = weight_A * sam (64-bit product)
+        shl     edx, 22                     # high dword contributes bits 22 and up
+        shr     eax, 10                     # low dword >> 10; CF = last bit shifted out
+        adc     eax, edx                    # eax = (product >> 10) with rounding
+        mov     edx, [edi]                  # edx = original sample
+        add     eax, edx
+        stosd                               # store result at [edi], edi += 4
+        test    ebx, ebx                    # flags consumed by the je below (mov keeps them)
+        mov     ebp, eax                    # ebp = value just stored (next "previous")
+        je      L218                        # no weight update if sam is zero
+        test    edx, edx
+        je      L218                        # ... or if the original sample is zero
+        mov     eax, [esp]                  # eax = delta
+        xor     ebx, edx
+        sar     ebx, 31                     # ebx = -1 if signs differ, else 0
+        xor     eax, ebx
+        sub     eax, ebx                    # eax = -delta if signs differ, else +delta
+        add     ecx, eax                    # update weight_A
+L218:   cmp     edi, esi                    # compare bptr and eptr to see if we're done
+        jb      long_mono_18_loop
+
+mono_1718_exit:
+        lea     ebp, [esp+16]               # restore ebp (we've pushed 4 DWORDS)
+        mov     edx, [ebp+8]                # edx = dpp*
+        mov     [edx+8], ecx                # store weight_A back
+        mov     eax, [edi-4]                # dpp->samples_A [0] = bptr [-1];
+        mov     [edx+16], eax
+        mov     eax, [edi-8]                # dpp->samples_A [1] = bptr [-2];
+        mov     [edx+20], eax
+
+mono_done:
+        pop     eax                         # pop delta & saved regs
+        pop     edi
+        pop     esi
+        pop     ebx
+        pop     ebp
+        ret
+
+# Helper function to determine if specified CPU feature is available (used here for MMX).
+# Input parameter is index of feature to be checked (EDX from CPUID(1) only, MMX = 23).
+# Return value is the specified bit (0 or 1) or 0 if CPUID is not supported.
+
+_unpack_cpu_has_feature_x86:
+unpack_cpu_has_feature_x86:
+        pushfd                              # save eflags
+        pushfd                              # push another copy
+        xor     dword ptr [esp], 0x200000   # toggle ID bit on stack & pop it back into eflags
+        popfd
+        pushfd                              # store possibly modified eflags
+        pop     eax                         # and pop back into eax
+        xor     eax, [esp]                  # compare to original pushed eflags
+        popfd                               # restore original eflags
+        and     eax, 0x200000               # eax is nonzero if the eflags ID bit was changeable
+        jz      oldcpu                      # return zero if CPUID is not available (wow!)
+
+        push    ebx                         # we must save ebx
+        mov     eax, 1                      # do cpuid (1) to get features into edx
+        cpuid
+        mov     eax, edx                    # copy into eax for shift
+        mov     cl, [esp+8]                 # get parameter and shift that bit index into LSB
+        sar     eax, cl
+        and     eax, 1                      # keep only the requested feature bit
+        pop     ebx                         # restore ebx and return 0 or 1
+
+oldcpu: ret                                 # return value in eax
+
+#ifdef __ELF__
+        .section .note.GNU-stack,"",@progbits
+#endif
+
diff --git a/third_party/wavpack/src/unpack_x86.asm b/third_party/wavpack/src/unpack_x86.asm
new file mode 100644
index 0000000..1d99155
--- /dev/null
+++ b/third_party/wavpack/src/unpack_x86.asm
@@ -0,0 +1,958 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;                           **** WAVPACK ****                            ;;
+;;                  Hybrid Lossless Wavefile Compressor                   ;;
+;;              Copyright (c) 1998 - 2015 Conifer Software.               ;;
+;;                          All Rights Reserved.                          ;;
+;;      Distributed under the BSD Software License (see license.txt)      ;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+        .686
+        .mmx
+        .model  flat
+asmcode segment page 'CODE'
+        public  _unpack_decorr_stereo_pass_cont_x86
+        public  _unpack_decorr_mono_pass_cont_x86
+        public  _unpack_cpu_has_feature_x86
+
+; This is an assembly optimized version of the following WavPack function:
+;
+; void unpack_decorr_stereo_pass_cont (struct decorr_pass *dpp,
+;                                      int32_t *buffer,
+;                                      int32_t sample_count,
+;                                      int32_t long_math);
+;
+; It performs a single pass of stereo decorrelation on the provided buffer.
+; Note that this version of the function requires that up to 8 previous
+; stereo samples are visible and correct. In other words, it ignores the
+; "samples_*" fields in the decorr_pass structure and gets the history data
+; directly from the buffer. It does, however, return the appropriate history
+; samples to the decorr_pass structure before returning.
+;
+; The "long_math" argument is used to specify that a 32-bit multiply is
+; not enough for the "apply_weight" operation (although in this case it
+; would only apply to the -1 and -2 terms because the MMX code does not have
+; this limitation) but we ignore the parameter and use the overflow detection
+; of the "imul" instruction to switch automatically to the "long_math" loop.
+;
+; This is written to work on an IA-32 processor and uses the MMX extensions
+; to improve the performance by processing both stereo channels together.
+; For terms -1 and -2 the MMX extensions are not usable, and so these are
+; performed independently without them.
+;
+; arguments on entry:
+;
+;   struct decorr_pass *dpp     [ebp+8]
+;   int32_t *buffer             [ebp+12]
+;   int32_t sample_count        [ebp+16]
+;   int32_t long_math           [ebp+20]
+;
+; registers after entry:
+;
+;   edi         bptr
+;   esi         eptr
+;
+; on stack (used for terms -1 and -2 only):
+; 
+;   int32_t delta             DWORD [esp]
+;
+
+_unpack_decorr_stereo_pass_cont_x86:
+        push    ebp                         ; set up ebp as frame pointer for the arguments
+        mov     ebp, esp
+        push    ebx                         ; save ebx, esi & edi (popped again at "done")
+        push    esi
+        push    edi
+
+        mov     edx, [ebp+8]                ; copy delta from dpp to top of stack
+        mov     eax, [edx+4]                ; eax = dpp->delta
+        push    eax                         ; delta is now at [esp] (4th pushed DWORD)
+
+        mov     edi, [ebp+12]               ; edi = buffer
+        mov     eax, [ebp+16]               ; get sample_count and multiply by 8 (bytes per L/R pair)
+        sal     eax, 3                      ; the shift sets ZF when the byte count is zero
+        jz      done                        ; exit now if there's nothing to do
+
+        add     eax, edi                    ; else add to buffer pointer to make eptr
+        mov     esi, eax                    ; esi = eptr
+    
+        mov     eax, [ebp+8]                ; get term from dpp and vector appropriately
+        mov     eax, [eax]                  ; eax = dpp->term
+        cmp     eax, 17
+        je      term_17_entry
+        cmp     eax, 18
+        je      term_18_entry
+        cmp     eax, -1
+        je      term_minus_1_entry
+        cmp     eax, -2
+        je      term_minus_2_entry
+        cmp     eax, -3
+        je      term_minus_3_entry          ; any other term falls through to default_term_entry
+
+;
+; registers during default term processing loop:
+;   edi         active buffer pointer
+;   esi         end of buffer pointer
+;
+; MMX:
+;   mm0, mm1    scratch
+;   mm2         original sample values
+;   mm3         correlation samples
+;   mm4         zero (for pcmpeqd)
+;   mm5         weights
+;   mm6         delta
+;   mm7         512 (for rounding)
+;
+
+default_term_entry:
+        imul    ebx, eax, -8                ; set ebx to term * -8 for decorrelation index
+        mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  ; mm7 = round (512)
+        mov     edx, [ebp+8]                ; edx = *dpp
+        mov     eax, [edx+4]
+        movd    mm6, eax
+        punpckldq mm6, mm6                  ; mm6 = delta (0-7)
+        mov     eax, 0FFFFh                 ; mask high weights to zero for PMADDWD
+        movd    mm5, eax
+        punpckldq mm5, mm5                  ; mm5 = weight mask 0x0000FFFF0000FFFF
+        pand    mm5, [edx+8]                ; mm5 = weight_AB masked to 16 bits
+        pxor    mm4, mm4                    ; mm4 = zero (for pcmpeqd)
+        jmp     default_term_loop
+
+        align  64
+default_term_loop:
+        movq    mm3, [edi+ebx]              ; mm3 = sam_AB
+        movq    mm1, mm3
+        movq    mm0, mm3
+        paddd   mm1, mm1
+        psrld   mm0, 15
+        psrlw   mm1, 1
+        pmaddwd mm0, mm5
+        pmaddwd mm1, mm5
+        movq    mm2, [edi]                  ; mm2 = left_right
+        pslld   mm0, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        paddd   mm0, mm2
+        paddd   mm0, mm1                    ; add shifted sums
+        movq    [edi], mm0                  ; store result
+        movq    mm0, mm3
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ left_right)
+        add     edi, 8
+        pcmpeqd mm2, mm4                    ; mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm4                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pxor    mm5, mm0
+        paddw   mm5, mm2                    ; and add to weight_AB
+        pxor    mm5, mm0
+        cmp     edi, esi                    ; compare bptr and eptr to see if we're done
+        jb      default_term_loop
+
+        pslld   mm5, 16                     ; sign-extend 16-bit weights back to dwords
+        psrad   mm5, 16
+        mov     eax, [ebp+8]                ; point to dpp
+        movq    [eax+8], mm5                ; put weight_AB back
+        emms
+        mov     edx, [ebp+8]                ; access dpp with edx
+        mov     ecx, [edx]                  ; ecx = dpp->term
+
+default_store_samples:
+        dec     ecx
+        sub     edi, 8                      ; back up one full sample
+        mov     eax, [edi+4]
+        mov     [edx+ecx*4+48], eax         ; store samples_B [ecx]
+        mov     eax, [edi]
+        mov     [edx+ecx*4+16], eax         ; store samples_A [ecx]
+        test    ecx, ecx
+        jnz     default_store_samples
+
+        jmp     done
+
+;
+; registers during processing loop for terms 17 & 18:
+;   edi         active buffer pointer
+;   esi         end of buffer pointer
+;
+; MMX:
+;   mm0, mm1    scratch
+;   mm2         original sample values
+;   mm3         calculated correlation samples
+;   mm4         last calculated values (so we don't need to reload)
+;   mm5         weights
+;   mm6         delta
+;   mm7         512 (for rounding)
+;
+
+term_17_entry:
+        mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  ; mm7 = round (512)
+        mov     edx, [ebp+8]                ; point to dpp & get delta
+        mov     eax, [edx+4]
+        movd    mm6, eax
+        punpckldq mm6, mm6                  ; mm6 = delta (0-7)
+        mov     eax, 0FFFFh                 ; mask high weights to zero for PMADDWD
+        movd    mm5, eax
+        punpckldq mm5, mm5                  ; mm5 = weight mask 0x0000FFFF0000FFFF
+        pand    mm5, [edx+8]                ; mm5 = weight_AB masked to 16 bits
+        movq    mm4, [edi-8]                ; preload previous calculated values
+        jmp     term_17_loop
+
+        align  64
+term_17_loop:
+        paddd   mm4, mm4                    ; mm4 = 2 * previous L/R pair
+        psubd   mm4, [edi-16]               ; mm4 = sam_AB (2 * previous pair - pair before that)
+        movq    mm3, mm4                    ; mm3 = sam_AB (kept for the weight update below)
+        movq    mm1, mm3
+        paddd   mm1, mm1
+        psrld   mm4, 15
+        psrlw   mm1, 1
+        pmaddwd mm4, mm5
+        pmaddwd mm1, mm5
+        movq    mm2, [edi]                  ; mm2 = left_right
+        pslld   mm4, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        paddd   mm4, mm2
+        paddd   mm4, mm1                    ; add shifted sums
+        movq    mm0, mm3
+        movq    [edi], mm4                  ; store result (mm4 also feeds the next iteration)
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ left_right)
+        add     edi, 8
+        pxor    mm1, mm1                    ; mm1 = zero
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pxor    mm5, mm0
+        paddw   mm5, mm2                    ; and add to weight_AB
+        pxor    mm5, mm0
+        cmp     edi, esi                    ; compare bptr and eptr to see if we're done
+        jb      term_17_loop
+
+        pslld   mm5, 16                     ; sign-extend 16-bit weights back to dwords
+        psrad   mm5, 16
+        mov     eax, [ebp+8]                ; point to dpp
+        movq    [eax+8], mm5                ; put weight_AB back
+        emms
+        jmp     term_1718_exit
+
+term_18_entry:
+        mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  ; mm7 = round (512)
+        mov     edx, [ebp+8]                ; point to dpp & get delta
+        mov     eax, [edx+4]
+        movd    mm6, eax
+        punpckldq mm6, mm6                  ; mm6 = delta (0-7)
+        mov     eax, 0FFFFh                 ; mask high weights to zero for PMADDWD
+        movd    mm5, eax
+        punpckldq mm5, mm5                  ; mm5 = weight mask 0x0000FFFF0000FFFF
+        pand    mm5, [edx+8]                ; mm5 = weight_AB masked to 16 bits
+        movq    mm4, [edi-8]                ; preload previous calculated value
+        jmp     term_18_loop
+
+        align  64
+term_18_loop:
+        movq    mm3, mm4
+        psubd   mm3, [edi-16]
+        psrad   mm3, 1
+        paddd   mm3, mm4                    ; mm3 = sam_AB
+        movq    mm1, mm3
+        movq    mm4, mm3
+        paddd   mm1, mm1
+        psrld   mm4, 15
+        psrlw   mm1, 1
+        pmaddwd mm4, mm5
+        pmaddwd mm1, mm5
+        movq    mm2, [edi]                  ; mm2 = left_right
+        pslld   mm4, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        paddd   mm4, mm2
+        paddd   mm4, mm1                    ; add shifted sums
+        movq    mm0, mm3
+        movq    [edi], mm4                  ; store result
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ left_right)
+        add     edi, 8
+        pxor    mm1, mm1                    ; mm1 = zero
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pxor    mm5, mm0
+        paddw   mm5, mm2                    ; and add to weight_AB
+        pxor    mm5, mm0
+        cmp     edi, esi                    ; compare bptr and eptr to see if we're done
+        jb      term_18_loop
+
+        pslld   mm5, 16                     ; sign-extend 16-bit weights back to dwords
+        psrad   mm5, 16
+        mov     eax, [ebp+8]                ; point to dpp
+        movq    [eax+8], mm5                ; put weight_AB back
+        emms
+
+term_1718_exit:
+        mov     edx, [edi-4]                ; dpp->samples_B [0] = bptr [-1];
+        mov     eax, [ebp+8]
+        mov     [eax+48], edx
+        mov     edx, [edi-8]                ; dpp->samples_A [0] = bptr [-2];
+        mov     [eax+16], edx
+        mov     edx, [edi-12]               ; dpp->samples_B [1] = bptr [-3];
+        mov     [eax+52], edx
+        mov     edx, [edi-16]               ; dpp->samples_A [1] = bptr [-4];
+        mov     [eax+20], edx
+        jmp     done
+
+;
+; registers in term -1 & -2 loops:
+;
+;   eax,ebx,edx scratch
+;   ecx         weight_A
+;   ebp         weight_B
+;   edi         bptr
+;   esi         eptr
+;
+
+term_minus_1_entry:
+        cld                                 ; we use stosd here...
+        mov     eax, [ebp+8]                ; point to dpp
+        mov     ecx, [eax+8]                ; ecx = weight_A and ebp = weight_B
+        mov     ebp, [eax+12]
+        mov     eax, [edi-4]
+        jmp     term_minus_1_loop
+
+        align  64
+term_minus_1_loop:
+        mov     ebx, eax
+        imul    eax, ecx
+        mov     edx, [edi]
+        jo      OV11
+        sar     eax, 10
+        adc     eax, edx
+        stosd
+        test    ebx, ebx
+        je      L182
+        test    edx, edx
+        je      L182
+        xor     ebx, edx
+        sar     ebx, 31
+        xor     ecx, ebx
+        add     ecx, [esp]
+        mov     edx, 1024
+        add     edx, ebx
+        cmp     ecx, edx
+        jle     L183
+        mov     ecx, edx
+L183:   xor     ecx, ebx
+L182:   mov     ebx, eax
+        imul    eax, ebp
+        mov     edx, [edi]
+        jo      OV12
+        sar     eax, 10
+        adc     eax, edx
+        stosd
+        test    ebx, ebx
+        je      L189
+        test    edx, edx
+        je      L189
+        xor     ebx, edx
+        sar     ebx, 31
+        xor     ebp, ebx
+        add     ebp, [esp]
+        mov     edx, 1024
+        add     edx, ebx
+        cmp     ebp, edx
+        jle     L188
+        mov     ebp, edx
+L188:   xor     ebp, ebx
+L189:   cmp     edi, esi                    ; compare bptr and eptr to see if we're done
+        jb      term_minus_1_loop
+        jmp     term_minus_1_done
+
+OV11:   mov     eax, ebx                    ; restore previous sample into eax
+        jmp     long_term_minus_1_loop
+
+OV12:   mov     eax, ebx                    ; restore previous sample into eax
+        jmp     L282
+
+        align  64
+long_term_minus_1_loop:
+        mov     ebx, eax
+        imul    ecx
+        shl     edx, 22
+        shr     eax, 10
+        adc     eax, edx
+        mov     edx, [edi]
+        add     eax, edx
+        stosd
+        test    ebx, ebx
+        je      L282
+        test    edx, edx
+        je      L282
+        xor     ebx, edx
+        sar     ebx, 31
+        xor     ecx, ebx
+        add     ecx, [esp]
+        mov     edx, 1024
+        add     edx, ebx
+        cmp     ecx, edx
+        jle     L283
+        mov     ecx, edx
+L283:   xor     ecx, ebx
+L282:   mov     ebx, eax
+        imul    ebp
+        shl     edx, 22
+        shr     eax, 10
+        adc     eax, edx
+        mov     edx, [edi]
+        add     eax, edx
+        stosd
+        test    ebx, ebx
+        je      L289
+        test    edx, edx
+        je      L289
+        xor     ebx, edx
+        sar     ebx, 31
+        xor     ebp, ebx
+        add     ebp, [esp]
+        mov     edx, 1024
+        add     edx, ebx
+        cmp     ebp, edx
+        jle     L288
+        mov     ebp, edx
+L288:   xor     ebp, ebx
+L289:   cmp     edi, esi                    ; compare bptr and eptr to see if we're done
+        jb      long_term_minus_1_loop
+
+term_minus_1_done:
+        mov     edx, ebp
+        mov     ebp, esp                    ; restore ebp (we've pushed 4 DWORDS)
+        add     ebp, 16
+        mov     eax, [ebp+8]                ; point to dpp
+        mov     [eax+8], ecx
+        mov     [eax+12], edx
+        mov     edx, [edi-4]                ; dpp->samples_A [0] = bptr [-1]
+        mov     [eax+16], edx
+        jmp     done
+
+
+term_minus_2_entry:
+        mov     eax, [ebp+8]                ; point to dpp
+        mov     ecx, [eax+8]                ; ecx = weight_A and ebp = weight_B
+        mov     ebp, [eax+12]
+        mov     eax, [edi-8]
+        jmp     term_minus_2_loop
+
+        align  64
+term_minus_2_loop:
+        mov     ebx, eax
+        imul    eax, ebp
+        mov     edx, [edi+4]
+        jo      OV21
+        sar     eax, 10
+        adc     eax, edx
+        mov     [edi+4], eax
+        test    ebx, ebx
+        je      L194
+        test    edx, edx
+        je      L194
+        xor     ebx, edx
+        sar     ebx, 31
+        xor     ebp, ebx
+        add     ebp, [esp]
+        mov     edx, 1024
+        add     edx, ebx
+        cmp     ebp, edx
+        jle     L195
+        mov     ebp, edx
+L195:   xor     ebp, ebx
+L194:   mov     ebx, eax
+        imul    eax, ecx
+        mov     edx, [edi]
+        jo      OV22
+        sar     eax, 10
+        adc     eax, edx
+        mov     [edi], eax
+        add     edi, 8
+        test    ebx, ebx
+        je      L201
+        test    edx, edx
+        je      L201
+        xor     ebx, edx
+        sar     ebx, 31
+        xor     ecx, ebx
+        add     ecx, [esp]
+        mov     edx, 1024
+        add     edx, ebx
+        cmp     ecx, edx
+        jle     L200
+        mov     ecx, edx
+L200:   xor     ecx, ebx
+L201:   cmp     edi, esi                    ; compare bptr and eptr to see if we're done
+        jb      term_minus_2_loop
+        jmp     term_minus_2_done
+
+OV21:   mov     eax, ebx                    ; restore previous sample into eax
+        jmp     long_term_minus_2_loop
+
+OV22:   mov     eax, ebx                    ; restore previous sample into eax
+        jmp     L294
+
+        align  64
+long_term_minus_2_loop:
+        mov     ebx, eax
+        imul    ebp
+        shl     edx, 22
+        shr     eax, 10
+        adc     eax, edx
+        mov     edx, [edi+4]
+        add     eax, edx
+        mov     [edi+4], eax
+        test    ebx, ebx
+        je      L294
+        test    edx, edx
+        je      L294
+        xor     ebx, edx
+        sar     ebx, 31
+        xor     ebp, ebx
+        add     ebp, [esp]
+        mov     edx, 1024
+        add     edx, ebx
+        cmp     ebp, edx
+        jle     L295
+        mov     ebp, edx
+L295:   xor     ebp, ebx
+L294:   mov     ebx, eax
+        imul    ecx
+        shl     edx, 22
+        shr     eax, 10
+        adc     eax, edx
+        mov     edx, [edi]
+        add     eax, edx
+        mov     [edi], eax
+        add     edi, 8
+        test    ebx, ebx
+        je      L301
+        test    edx, edx
+        je      L301
+        xor     ebx, edx
+        sar     ebx, 31
+        xor     ecx, ebx
+        add     ecx, [esp]
+        mov     edx, 1024
+        add     edx, ebx
+        cmp     ecx, edx
+        jle     L300
+        mov     ecx, edx
+L300:   xor     ecx, ebx
+L301:   cmp     edi, esi                    ; compare bptr and eptr to see if we're done
+        jb      long_term_minus_2_loop
+
+term_minus_2_done:
+        mov     edx, ebp
+        lea     ebp, [esp+16]               ; restore ebp (we've pushed 4 DWORDS)
+        mov     eax, [ebp+8]                ; point to dpp
+        mov     [eax+8], ecx
+        mov     [eax+12], edx
+        mov     edx, [edi-8]                ; dpp->samples_B [0] = bptr [-2];
+        mov     [eax+48], edx
+        jmp     done
+
+;
+; registers during processing loop for term -3:
+;   edi         active buffer pointer
+;   esi         end of buffer pointer
+;
+; MMX:
+;   mm0, mm1    scratch
+;   mm2         original sample values
+;   mm3         calculated correlation samples
+;   mm4         last calculated values (so we don't need to reload)
+;   mm5         weights
+;   mm6         delta
+;   mm7         512 (for rounding)
+;
+
+term_minus_3_entry:
+        mov     eax, 512
+        movd    mm7, eax
+        punpckldq mm7, mm7                  ; mm7 = round (512)
+        mov     edx, [ebp+8]                ; point to dpp & get delta
+        mov     eax, [edx+4]
+        movd    mm6, eax
+        punpckldq mm6, mm6                  ; mm6 = delta (0-7)
+        mov     eax, 0FFFFh                 ; mask high weights to zero for PMADDWD
+        movd    mm5, eax
+        punpckldq mm5, mm5                  ; mm5 = weight mask 0x0000FFFF0000FFFF
+        pand    mm5, [edx+8]                ; mm5 = weight_AB masked to 16 bits
+        movq    mm4, [edi-8]                ; preload previous calculated values
+        jmp     term_minus_3_loop
+
+        align  64
+term_minus_3_loop:
+        movq    mm3, mm4                    ; mm3 = swap dwords (mm4)
+        psrlq   mm3, 32
+        punpckldq mm3, mm4                  ; mm3 = sam_AB
+        movq    mm1, mm3
+        movq    mm4, mm3
+        pslld   mm1, 1
+        psrld   mm4, 15
+        psrlw   mm1, 1
+        pmaddwd mm4, mm5
+        pmaddwd mm1, mm5
+        movq    mm2, [edi]                  ; mm2 = left_right
+        pslld   mm4, 5
+        paddd   mm1, mm7                    ; add 512 for rounding
+        psrad   mm1, 10
+        paddd   mm4, mm2
+        paddd   mm4, mm1                    ; add shifted sums
+        movq    [edi], mm4                  ; store result
+        movq    mm0, mm3
+        pxor    mm0, mm2
+        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ left_right)
+        add     edi, 8
+        pxor    mm1, mm1                    ; mm1 = zero
+        pcmpeqd mm2, mm1                    ; mm2 = 1s if left_right was zero
+        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
+        por     mm2, mm3                    ; mm2 = 1s if either was zero
+        pandn   mm2, mm6                    ; mask delta with zeros check
+        pcmpeqd mm1, mm1
+        psubd   mm1, mm7
+        psubd   mm1, mm7
+        psubd   mm1, mm0
+        pxor    mm5, mm0
+        paddw   mm5, mm1
+        paddusw mm5, mm2                    ; and add to weight_AB
+        psubw   mm5, mm1
+        pxor    mm5, mm0
+        cmp     edi, esi                    ; compare bptr and eptr to see if we're done
+        jb      term_minus_3_loop
+
+        pslld   mm5, 16                     ; sign-extend 16-bit weights back to dwords
+        psrad   mm5, 16
+        mov     eax, [ebp+8]                ; point to dpp
+        movq    [eax+8], mm5                ; put weight_AB back
+        emms
+        mov     edx, [edi-4]                ; dpp->samples_A [0] = bptr [-1];
+        mov     eax, [ebp+8]
+        mov     [eax+16], edx
+        mov     edx, [edi-8]                ; dpp->samples_B [0] = bptr [-2];
+        mov     [eax+48], edx
+
+done:   pop     eax                         ; pop delta & saved regs
+        pop     edi
+        pop     esi
+        pop     ebx
+        pop     ebp
+        ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; This is the mono version of the above function. It does not use MMX and does not handle negative terms.
+;
+; void unpack_decorr_mono_pass_cont (struct decorr_pass *dpp,
+;                                    int32_t *buffer,
+;                                    int32_t sample_count,
+;                                    int32_t long_math);
+; arguments on entry:
+;
+;   struct decorr_pass *dpp     [ebp+8]
+;   int32_t *buffer             [ebp+12]
+;   int32_t sample_count        [ebp+16]
+;   int32_t long_math           [ebp+20]
+;
+; registers after entry:
+;
+;   edi         bptr
+;   esi         eptr
+;
+; on stack:
+;
+;   int32_t delta             DWORD [esp]
+;
+
+_unpack_decorr_mono_pass_cont_x86:
+        push    ebp                         ; set up ebp as frame pointer for the arguments
+        mov     ebp, esp
+        push    ebx                         ; save ebx, esi & edi (popped again at "mono_done")
+        push    esi
+        push    edi
+        cld                                 ; the term 17 & 18 loops store with stosd
+
+        mov     edx, [ebp+8]                ; copy delta from dpp to local stack
+        mov     eax, [edx+4]                ; eax = dpp->delta
+        push    eax                         ; delta is now at [esp] (4th pushed DWORD)
+
+        mov     edi, [ebp+12]               ; edi = buffer
+        mov     eax, [ebp+16]               ; get sample_count and multiply by 4
+        sal     eax, 2                      ; the shift sets ZF when the byte count is zero
+        jz      mono_done                   ; exit now if there's nothing to do
+        lea     esi, [edi+eax]              ; else add to buffer pointer to make eptr
+
+        mov     eax, [ebp+8]                ; get term from dpp and vector appropriately
+        mov     eax, [eax]                  ; eax = dpp->term
+        cmp     eax, 17
+        je      mono_17_entry
+        cmp     eax, 18
+        je      mono_18_entry               ; any other term falls through to default_mono_entry
+
+;
+; registers during default term processing loop:
+;   edi         active buffer pointer
+;   esi         end of buffer pointer
+;   ecx         weight_A
+;   ebp         free
+;   ebx         term * -4
+;   eax,edx     scratch
+;
+
+default_mono_entry:
+        imul    ebx, eax, -4                ; set ebx to term * -4 for decorrelation index
+        mov     edx, [ebp+8]                ; edx = dpp*
+        mov     ecx, [edx+8]                ; ecx = weight
+        jmp     default_mono_loop
+
+;
+; registers during processing loop for terms 17 & 18:
+;   edi         active buffer pointer
+;   esi         end of buffer pointer
+;   ecx         weight_A
+;   ebp         previously calculated value
+;   ebx         calculated correlation sample
+;   eax,edx     scratch
+;
+
+mono_17_entry:
+        mov     edx, [ebp+8]                ; edx = dpp*
+        mov     ecx, [edx+8]                ; ecx = weight_A
+        mov     ebp, [edi-4]
+        jmp     mono_17_loop
+
+mono_18_entry:
+        mov     edx, [ebp+8]                ; edx = dpp*
+        mov     ecx, [edx+8]                ; ecx = weight_A
+        mov     ebp, [edi-4]
+        jmp     mono_18_loop
+
+        align  64
+default_mono_loop:
+        mov     eax, [edi+ebx]              ; eax = correlation sample, "term" samples back
+        imul    eax, ecx                    ; 32-bit multiply by weight (OF set if it overflowed)
+        mov     edx, [edi]                  ; edx = current sample (mov leaves the flags alone)
+        jo      long_default_mono_loop      ; on overflow redo this sample with 64-bit math
+        sar     eax, 10                     ; scale product down; CF = last bit shifted out
+        adc     eax, edx                    ; add current sample plus CF (rounds the product)
+        mov     [edi], eax                  ; store result
+        mov     eax, [edi+ebx]              ; reload correlation sample for the weight update
+        add     edi, 4                      ; advance bptr
+        test    edx, edx                    ; leave weight alone if current sample was zero...
+        je      L100
+        test    eax, eax                    ; ...or if correlation sample was zero
+        je      L100
+        xor     eax, edx                    ; sign bit set if the two samples' signs differ
+        cdq                                 ; edx = 0 (same signs) or -1 (different signs)
+        xor     ecx, edx                    ; conditionally complement weight,
+        add     ecx, [esp]                  ; add delta,
+        xor     ecx, edx                    ; and undo: weight += delta, or -= delta if signs differ
+L100:   cmp     edi, esi                    ; compare bptr and eptr to see if we're done
+        jb      default_mono_loop
+        jmp     default_mono_done
+
+        align  64
+long_default_mono_loop:
+        mov     eax, [edi+ebx]
+        imul    ecx
+        shl     edx, 22
+        shr     eax, 10
+        adc     eax, edx
+        mov     edx, [edi]
+        add     eax, edx
+        mov     [edi], eax
+        mov     eax, [edi+ebx]
+        add     edi, 4
+        test    edx, edx
+        je      L101
+        test    eax, eax
+        je      L101
+        xor     eax, edx
+        cdq
+        xor     ecx, edx
+        add     ecx, [esp]
+        xor     ecx, edx
+L101:   cmp     edi, esi                    ; compare bptr and eptr to see if we're done
+        jb      long_default_mono_loop
+
+default_mono_done:
+        mov     edx, [ebp+8]                ; edx = dpp*
+        mov     [edx+8], ecx                ; store weight_A back
+        mov     ecx, [edx]                  ; ecx = dpp->term
+
+default_mono_store_samples:
+        dec     ecx
+        sub     edi, 4                      ; back up one full sample
+        mov     eax, [edi]
+        mov     [edx+ecx*4+16], eax         ; store samples_A [ecx]
+        test    ecx, ecx
+        jnz     default_mono_store_samples
+        jmp     mono_done
+
+        align  64
+mono_17_loop:
+        lea     ebx, [ebp+ebp]
+        sub     ebx, [edi-8]
+        mov     eax, ecx
+        imul    eax, ebx
+        mov     edx, [edi]
+        jo      long_mono_17_loop
+        sar     eax, 10
+        adc     eax, edx
+        stosd
+        test    ebx, ebx
+        mov     ebp, eax
+        je      L117
+        test    edx, edx
+        je      L117
+        mov     eax, [esp]
+        xor     ebx, edx
+        sar     ebx, 31
+        xor     eax, ebx
+        sub     eax, ebx
+        add     ecx, eax
+L117:   cmp     edi, esi                    ; compare bptr and eptr to see if we're done
+        jb      mono_17_loop
+        jmp     mono_1718_exit
+
+        align  64
+long_mono_17_loop:
+        lea     ebx, [ebp+ebp]
+        sub     ebx, [edi-8]
+        mov     eax, ecx
+        imul    ebx
+        shl     edx, 22
+        shr     eax, 10
+        adc     eax, edx
+        mov     edx, [edi]
+        add     eax, edx
+        stosd
+        test    ebx, ebx
+        mov     ebp, eax
+        je      L217
+        test    edx, edx
+        je      L217
+        mov     eax, [esp]
+        xor     ebx, edx
+        sar     ebx, 31
+        xor     eax, ebx
+        sub     eax, ebx
+        add     ecx, eax
+L217:   cmp     edi, esi                    ; compare bptr and eptr to see if we're done
+        jb      long_mono_17_loop
+        jmp     mono_1718_exit
+
+        align  64
+mono_18_loop:
+        lea     ebx, [ebp+ebp*2]
+        sub     ebx, [edi-8]
+        sar     ebx, 1
+        mov     eax, ecx
+        imul    eax, ebx
+        mov     edx, [edi]
+        jo      long_mono_18_loop
+        sar     eax, 10
+        adc     eax, edx
+        stosd
+        test    ebx, ebx
+        mov     ebp, eax
+        je      L118
+        test    edx, edx
+        je      L118
+        mov     eax, [esp]
+        xor     ebx, edx
+        sar     ebx, 31
+        xor     eax, ebx
+        sub     eax, ebx
+        add     ecx, eax
+L118:   cmp     edi, esi                    ; compare bptr and eptr to see if we're done
+        jb      mono_18_loop
+        jmp     mono_1718_exit
+
+        align  64
+long_mono_18_loop:
+        lea     ebx, [ebp+ebp*2]
+        sub     ebx, [edi-8]
+        sar     ebx, 1
+        mov     eax, ecx
+        imul    ebx
+        shl     edx, 22
+        shr     eax, 10
+        adc     eax, edx
+        mov     edx, [edi]
+        add     eax, edx
+        stosd
+        test    ebx, ebx
+        mov     ebp, eax
+        je      L218
+        test    edx, edx
+        je      L218
+        mov     eax, [esp]
+        xor     ebx, edx
+        sar     ebx, 31
+        xor     eax, ebx
+        sub     eax, ebx
+        add     ecx, eax
+L218:   cmp     edi, esi                    ; compare bptr and eptr to see if we're done
+        jb      long_mono_18_loop
+
+mono_1718_exit:
+        lea     ebp, [esp+16]               ; restore ebp (we've pushed 4 DWORDS)
+        mov     edx, [ebp+8]                ; edx = dpp*
+        mov     [edx+8], ecx                ; store weight_A back
+        mov     eax, [edi-4]                ; dpp->samples_A [0] = bptr [-1];
+        mov     [edx+16], eax
+        mov     eax, [edi-8]                ; dpp->samples_A [1] = bptr [-2];
+        mov     [edx+20], eax
+
+mono_done:
+        pop     eax                         ; pop delta & saved regs
+        pop     edi
+        pop     esi
+        pop     ebx
+        pop     ebp
+        ret
+
+; Helper function to determine if specified CPU feature is available (used here for MMX).
+; Input parameter is index of feature to be checked (EDX from CPUID(1) only, MMX = 23).
+; Return value is the specified bit (0 or 1) or 0 if CPUID is not supported.
+
+_unpack_cpu_has_feature_x86:
+        pushfd                              ; save eflags
+        pushfd                              ; push another copy
+        xor     dword ptr [esp], 200000h    ; toggle ID bit (bit 21) in the copy on the stack
+        popfd                               ; and load that toggled copy into eflags
+        pushfd                              ; store possibly modified eflags
+        pop     eax                         ; and pop back into eax
+        xor     eax, [esp]                  ; compare to original pushed eflags
+        popfd                               ; restore original eflags
+        and     eax, 200000h                ; eax != 0 (ZF clear) if eflags ID bit was changeable
+        jz      oldcpu                      ; return zero if CPUID is not available (wow!)
+
+        push    ebx                         ; we must save ebx (cpuid overwrites it)
+        mov     eax, 1                      ; do cpuid (1) to get features into edx
+        cpuid                               ; overwrites eax, ebx, ecx and edx
+        mov     eax, edx                    ; copy into eax for shift
+        mov     cl, [esp+8]                 ; get parameter (above saved ebx & return address)
+        sar     eax, cl                     ; shift the requested bit index into LSB
+        and     eax, 1                      ; and discard all the other feature bits
+        pop     ebx                         ; restore ebx and return 0 or 1
+
+oldcpu: ret                                 ; return value in eax
+
+asmcode ends
+
+        end
+
diff --git a/third_party/wavpack/src/wavpack_local.h b/third_party/wavpack/src/wavpack_local.h
index 5c69108..fc75628 100644
--- a/third_party/wavpack/src/wavpack_local.h
+++ b/third_party/wavpack/src/wavpack_local.h
@@ -1,7 +1,7 @@
 ////////////////////////////////////////////////////////////////////////////
 //                           **** WAVPACK ****                            //
 //                  Hybrid Lossless Wavefile Compressor                   //
-//              Copyright (c) 1998 - 2006 Conifer Software.               //
+//              Copyright (c) 1998 - 2013 Conifer Software.               //
 //                          All Rights Reserved.                          //
 //      Distributed under the BSD Software License (see license.txt)      //
 ////////////////////////////////////////////////////////////////////////////
@@ -11,19 +11,17 @@
 #ifndef WAVPACK_LOCAL_H
 #define WAVPACK_LOCAL_H
 
-#ifndef __has_builtin
-#define __has_builtin(x) 0
-#endif
-
-#if defined(WIN32)
+#if defined(_WIN32)
+#define strdup(x) _strdup(x)
 #define FASTCALL __fastcall
 #else
 #define FASTCALL
 #endif
 
-#if defined(WIN32) || \
-    (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN))
-#define BITSTREAM_SHORTS    // use "shorts" for reading/writing bitstreams
+#if defined(_WIN32) || \
+    (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \
+    (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
+#define BITSTREAM_SHORTS    // use 16-bit "shorts" for reading/writing bitstreams (instead of chars)
                             //  (only works on little-endian machines)
 #endif
 
@@ -31,7 +29,7 @@
 
 // This header file contains all the definitions required by WavPack.
 
-#if defined(_WIN32) && !defined(__MINGW32__)
+#if defined(_MSC_VER) && _MSC_VER < 1600
 #include <stdlib.h>
 typedef unsigned __int64 uint64_t;
 typedef unsigned __int32 uint32_t;
@@ -41,14 +39,13 @@ typedef __int64 int64_t;
 typedef __int32 int32_t;
 typedef __int16 int16_t;
 typedef __int8  int8_t;
-typedef float float32_t;
 #else
-#include <inttypes.h>
+#include <stdint.h>
 #endif
 
 // Because the C99 specification states that "The order of allocation of
-// bit-ﬁelds within a unit (high-order to low-order or low-order to
-// high-order) is implementation-deﬁned" (6.7.2.1), I decided to change
+// bit-fields within a unit (high-order to low-order or low-order to
+// high-order) is implementation-defined" (6.7.2.1), I decided to change
 // the representation of floating-point values from a structure of
 // bit-fields to a 32-bit integer with access macros. Note that the WavPack
 // library doesn't use any floating-point math to implement compression of
@@ -58,6 +55,7 @@ typedef float float32_t;
 typedef int32_t f32;
 
 #define get_mantissa(f)     ((f) & 0x7fffff)
+#define get_magnitude(f)    ((f) & 0x7fffffff)
 #define get_exponent(f)     (((f) >> 23) & 0xff)
 #define get_sign(f)         (((f) >> 31) & 0x1)
 
@@ -92,7 +90,8 @@ typedef struct {
 #define APE_TAG_MAX_LENGTH      (1024 * 1024 * 16)
 
 typedef struct {
-    int32_t tag_file_pos, tag_begins_file;
+    int64_t tag_file_pos;
+    int tag_begins_file;
     ID3_Tag id3_tag;
     APE_Tag_Hdr ape_tag_hdr;
     unsigned char *ape_tag_data;
@@ -115,12 +114,12 @@ typedef struct {
 #define ChunkHeaderFormat "4L"
 
 typedef struct {
-    unsigned short FormatTag, NumChannels;
+    uint16_t FormatTag, NumChannels;
     uint32_t SampleRate, BytesPerSecond;
-    unsigned short BlockAlign, BitsPerSample;
-    unsigned short cbSize, ValidBitsPerSample;
+    uint16_t BlockAlign, BitsPerSample;
+    uint16_t cbSize, ValidBitsPerSample;
     int32_t ChannelMask;
-    unsigned short SubFormat;
+    uint16_t SubFormat;
     char GUID [14];
 } WaveHeader;
 
@@ -135,13 +134,43 @@ typedef struct {
 typedef struct {
     char ckID [4];
     uint32_t ckSize;
-    short version;
-    unsigned char track_no, index_no;
+    int16_t version;
+    unsigned char block_index_u8;
+    unsigned char total_samples_u8;
     uint32_t total_samples, block_index, block_samples, flags, crc;
 } WavpackHeader;
 
 #define WavpackHeaderFormat "4LS2LLLLL"
 
+// Macros to access the 40-bit block_index field
+
+#define GET_BLOCK_INDEX(hdr) ( (int64_t) (hdr).block_index + ((int64_t) (hdr).block_index_u8 << 32) )
+
+#define SET_BLOCK_INDEX(hdr,value) do { \
+    int64_t tmp = (value);              \
+    (hdr).block_index = (uint32_t) tmp; \
+    (hdr).block_index_u8 =              \
+        (unsigned char) (tmp >> 32);    \
+} while (0)
+
+// Macros to access the 40-bit total_samples field. All 1's in the lower 32 bits indicate "unknown"
+// (regardless of the upper 8 bits), so SET skips those encodings and GET removes the resulting offset.
+
+#define GET_TOTAL_SAMPLES(hdr) ( ((hdr).total_samples == (uint32_t) -1) ? -1 : \
+    (int64_t) (hdr).total_samples + ((int64_t) (hdr).total_samples_u8 << 32) - (hdr).total_samples_u8 )
+
+#define SET_TOTAL_SAMPLES(hdr,value) do {       \
+    int64_t tmp = (value);                      \
+    if (tmp < 0)                                \
+        (hdr).total_samples = (uint32_t) -1;    \
+    else {                                      \
+        tmp += (tmp / (int64_t) 0xffffffff);    \
+        (hdr).total_samples = (uint32_t) tmp;   \
+        (hdr).total_samples_u8 =                \
+            (unsigned char) (tmp >> 32);        \
+    }                                           \
+} while (0)
+
 // or-values for "flags"
 
 #define BYTES_STORED    3       // 1-4 bytes/sample
@@ -169,17 +198,21 @@ typedef struct {
 #define SRATE_MASK      (0xfL << SRATE_LSB)
 
 #define FALSE_STEREO    0x40000000      // block is stereo, but data is mono
-
-#define IGNORED_FLAGS   0x18000000      // reserved, but ignore if encountered
 #define NEW_SHAPING     0x20000000      // use IIR filter for negative shaping
-#define UNKNOWN_FLAGS   0x80000000      // also reserved, but refuse decode if
-                                        //  encountered
 
 #define MONO_DATA (MONO_FLAG | FALSE_STEREO)
 
+// Introduced in WavPack 5.0:
+#define HAS_CHECKSUM    0x10000000      // block contains a trailing checksum
+#define DSD_FLAG        0x80000000      // block is encoded DSD (1-bit PCM)
+
+#define IGNORED_FLAGS   0x08000000      // reserved, but ignore if encountered
+#define UNKNOWN_FLAGS   0x00000000      // we no longer have any of these spares
+
 #define MIN_STREAM_VERS     0x402       // lowest stream version we'll decode
 #define MAX_STREAM_VERS     0x410       // highest stream version we'll decode or encode
-#define CUR_STREAM_VERS     0x407       // stream version we are [normally] writing now
+                                        // (only stream version to support mono optimization)
+#define CUR_STREAM_VERS     0x407       // universally compatible stream version
 
 
 //////////////////////////// WavPack Metadata /////////////////////////////////
@@ -211,14 +244,20 @@ typedef struct {
 #define ID_WVC_BITSTREAM        0xb
 #define ID_WVX_BITSTREAM        0xc
 #define ID_CHANNEL_INFO         0xd
+#define ID_DSD_BLOCK            0xe
 
 #define ID_RIFF_HEADER          (ID_OPTIONAL_DATA | 0x1)
 #define ID_RIFF_TRAILER         (ID_OPTIONAL_DATA | 0x2)
-#define ID_REPLAY_GAIN          (ID_OPTIONAL_DATA | 0x3)
-#define ID_CUESHEET             (ID_OPTIONAL_DATA | 0x4)
+#define ID_ALT_HEADER           (ID_OPTIONAL_DATA | 0x3)
+#define ID_ALT_TRAILER          (ID_OPTIONAL_DATA | 0x4)
 #define ID_CONFIG_BLOCK         (ID_OPTIONAL_DATA | 0x5)
 #define ID_MD5_CHECKSUM         (ID_OPTIONAL_DATA | 0x6)
 #define ID_SAMPLE_RATE          (ID_OPTIONAL_DATA | 0x7)
+#define ID_ALT_EXTENSION        (ID_OPTIONAL_DATA | 0x8)
+#define ID_ALT_MD5_CHECKSUM     (ID_OPTIONAL_DATA | 0x9)
+#define ID_NEW_CONFIG_BLOCK     (ID_OPTIONAL_DATA | 0xa)
+#define ID_CHANNEL_IDENTITIES   (ID_OPTIONAL_DATA | 0xb)
+#define ID_BLOCK_CHECKSUM       (ID_OPTIONAL_DATA | 0xf)
 
 ///////////////////////// WavPack Configuration ///////////////////////////////
 
@@ -255,6 +294,7 @@ typedef struct {
 #define CONFIG_CREATE_EXE       0x40000 // create executable
 #define CONFIG_CREATE_WVC       0x80000 // create correction file
 #define CONFIG_OPTIMIZE_WVC     0x100000 // maximize bybrid compression
+#define CONFIG_COMPATIBLE_WRITE 0x400000 // write files for decoders < 4.3
 #define CONFIG_CALC_NOISE       0x800000 // calc noise in hybrid mode
 #define CONFIG_LOSSY_MODE       0x1000000 // obsolete (for information)
 #define CONFIG_EXTRA_MODE       0x2000000 // extra processing mode
@@ -264,6 +304,8 @@ typedef struct {
 #define CONFIG_PAIR_UNDEF_CHANS 0x20000000 // encode undefined channels in stereo pairs
 #define CONFIG_OPTIMIZE_MONO    0x80000000 // optimize for mono streams posing as stereo
 
+#define QMODE_DSD_AUDIO         0x30    // if either of these is set in qmode (version 5.0)
+
 /*
  * These config flags were never actually used, or are no longer used, or are
  * used for something else now. They may be used in the future for what they
@@ -305,7 +347,7 @@ typedef struct {
 
 typedef struct bs {
 #ifdef BITSTREAM_SHORTS
-    unsigned short *buf, *end, *ptr;
+    uint16_t *buf, *end, *ptr;
 #else
     unsigned char *buf, *end, *ptr;
 #endif
@@ -320,8 +362,10 @@ typedef struct bs {
 #define MAX_NTERMS 16
 #define MAX_TERM 8
 
+// Note that this structure is directly accessed in assembly files, so modify with care
+
 struct decorr_pass {
-    int term, delta, weight_A, weight_B;
+    int32_t term, delta, weight_A, weight_B;
     int32_t samples_A [MAX_TERM], samples_B [MAX_TERM];
     int32_t aweight_A, aweight_B;
     int32_t sum_A, sum_B;
@@ -342,6 +386,10 @@ struct words_data {
     struct entropy_data c [2];
 };
 
+typedef struct {
+    int32_t value, filter0, filter1, filter2, filter3, filter4, filter5, filter6, factor, byte;
+} DSDfilters;
+
 typedef struct {
     WavpackHeader wphdr;
     struct words_data w;
@@ -350,9 +398,10 @@ typedef struct {
     unsigned char *block2buff, *block2end;
     int32_t *sample_buffer;
 
+    int64_t sample_index;
     int bits, num_terms, mute_error, joint_stereo, false_stereo, shift;
     int num_decorrs, num_passes, best_decorr, mask_decorr;
-    uint32_t sample_index, crc, crc_x, crc_wvx;
+    uint32_t crc, crc_x, crc_wvx;
     Bitstream wvbits, wvcbits, wvxbits;
     int init_done, wvc_skip;
     float delta_decay;
@@ -363,12 +412,22 @@ typedef struct {
     struct {
         int32_t shaping_acc [2], shaping_delta [2], error [2];
         double noise_sum, noise_ave, noise_max;
-        short *shaping_data, *shaping_array;
+        int16_t *shaping_data, *shaping_array;
         int32_t shaping_samples;
     } dc;
 
     struct decorr_pass decorr_passes [MAX_NTERMS], analysis_pass;
     const WavpackDecorrSpec *decorr_specs;
+
+    struct {
+        unsigned char *byteptr, *endptr, (*probabilities) [256], **value_lookup, mode, ready;
+        int history_bins, p0, p1;
+        int16_t (*summed_probabilities) [256];
+        uint32_t low, high, value;
+        DSDfilters filters [2];
+        int32_t *ptable;
+    } dsd;
+
 } WavpackStream;
 
 // flags for float_flags:
@@ -399,6 +458,22 @@ typedef struct {
     int32_t (*write_bytes)(void *id, void *data, int32_t bcount);
 } WavpackStreamReader;
 
+// Extended version of structure for handling large files and added
+// functionality for truncating and closing files
+
+typedef struct {
+    int32_t (*read_bytes)(void *id, void *data, int32_t bcount);
+    int32_t (*write_bytes)(void *id, void *data, int32_t bcount);
+    int64_t (*get_pos)(void *id);                               // new signature for large files
+    int (*set_pos_abs)(void *id, int64_t pos);                  // new signature for large files
+    int (*set_pos_rel)(void *id, int64_t delta, int mode);      // new signature for large files
+    int (*push_back_byte)(void *id, int c);
+    int64_t (*get_length)(void *id);                            // new signature for large files
+    int (*can_seek)(void *id);
+    int (*truncate_here)(void *id);                             // new function to truncate file at current position
+    int (*close)(void *id);                                     // new function to close file
+} WavpackStreamReader64;
+
 typedef int (*WavpackBlockOutput)(void *id, void *data, int32_t bcount);
 
 typedef struct {
@@ -414,12 +489,13 @@ typedef struct {
     WavpackBlockOutput blockout;
     void *wv_out, *wvc_out;
 
-    WavpackStreamReader *reader;
+    WavpackStreamReader64 *reader;
     void *wv_in, *wvc_in;
 
-    uint32_t filelen, file2len, filepos, file2pos, total_samples, crc_errors, first_flags;
-    int wvc_flag, open_flags, norm_offset, reduced_channels, lossy_blocks, close_files;
-    uint32_t block_samples, ave_block_samples, block_boundary, max_samples, acc_samples, initial_index, riff_trailer_bytes;
+    int64_t filelen, file2len, filepos, file2pos, total_samples, initial_index;
+    uint32_t crc_errors, first_flags;
+    int wvc_flag, open_flags, norm_offset, reduced_channels, lossy_blocks, version_five;
+    uint32_t block_samples, ave_block_samples, block_boundary, max_samples, acc_samples, riff_trailer_bytes;
     int riff_header_added, riff_header_created;
     M_Tag m_tag;
 
@@ -427,6 +503,13 @@ typedef struct {
     WavpackStream **streams;
     void *stream3;
 
+    // these items were added in 5.0 to support alternate file types (especially CAF & DSD)
+    unsigned char file_format, *channel_reordering, *channel_identities;
+    uint32_t channel_layout, dsd_multiplier;
+    void *decimation_context;
+    char file_extension [8];
+
+    void (*close_callback)(void *wpc);
     char error_message [80];
 } WavpackContext;
 
@@ -434,6 +517,11 @@ typedef struct {
 
 #define CLEAR(destin) memset (&destin, 0, sizeof (destin));
 
+//////////////////////////////// decorrelation //////////////////////////////
+// modules: pack.c, unpack.c, unpack_floats.c, extra1.c, extra2.c
+
+// #define SKIP_DECORRELATION   // experimental switch to disable all decorrelation on encode
+
 // These macros implement the weight application and update operations
 // that are at the heart of the decorrelation loops. Note that there are
 // sometimes two and even three versions of each macro. Theses should be
@@ -449,15 +537,17 @@ typedef struct {
 #if 1   // PERFCOND - apply decorrelation weight when 32-bit overflow is possible
 #define apply_weight_f(weight, sample) (((((sample & 0xffff) * weight) >> 9) + \
     (((sample & ~0xffff) >> 9) * weight) + 1) >> 1)
+#elif 1
+#define apply_weight_f(weight, sample) ((int32_t)((weight * (int64_t) sample + 512) >> 10))
 #else
 #define apply_weight_f(weight, sample) ((int32_t)floor(((double) weight * sample + 512.0) / 1024.0))
 #endif
 
-#if 1   // PERFCOND - universal version that checks input magnitude (or simply uses 64-bit ints)
-#define apply_weight(weight, sample) (sample != (short) sample ? \
+#if 1   // PERFCOND - universal version that checks input magnitude or always uses long version
+#define apply_weight(weight, sample) (sample != (int16_t) sample ? \
     apply_weight_f (weight, sample) : apply_weight_i (weight, sample))
 #else
-#define apply_weight(weight, sample) ((int32_t)((weight * (int64_t) sample + 512) >> 10))
+#define apply_weight(weight, sample) (apply_weight_f (weight, sample))
 #endif
 
 #if 1   // PERFCOND
@@ -471,9 +561,6 @@ typedef struct {
     if (source && result) (source ^ result) < 0 ? (weight -= delta) : (weight += delta);
 #endif
 
-#define update_weight_d2(weight, delta, source, result) \
-    if (source && result) weight -= (((source ^ result) >> 29) & 4) - 2;
-
 #define update_weight_clip(weight, delta, source, result) \
     if (source && result) { \
         const int32_t s = (source ^ result) >> 31; \
@@ -481,29 +568,59 @@ typedef struct {
         weight = (weight ^ s) - s; \
     }
 
-#define update_weight_clip_d2(weight, delta, source, result) \
-    if (source && result) { \
-        const int32_t s = (source ^ result) >> 31; \
-        if ((weight = (weight ^ s) + (2 - s)) > 1024) weight = 1024; \
-        weight = (weight ^ s) - s; \
-    }
+void pack_init (WavpackContext *wpc);
+int pack_block (WavpackContext *wpc, int32_t *buffer);
+void send_general_metadata (WavpackContext *wpc);
+void free_metadata (WavpackMetadata *wpmd);
+int copy_metadata (WavpackMetadata *wpmd, unsigned char *buffer_start, unsigned char *buffer_end);
+double WavpackGetEncodedNoise (WavpackContext *wpc, double *peak);
+int unpack_init (WavpackContext *wpc);
+int read_decorr_terms (WavpackStream *wps, WavpackMetadata *wpmd);
+int read_decorr_weights (WavpackStream *wps, WavpackMetadata *wpmd);
+int read_decorr_samples (WavpackStream *wps, WavpackMetadata *wpmd);
+int read_shaping_info (WavpackStream *wps, WavpackMetadata *wpmd);
+int32_t unpack_samples (WavpackContext *wpc, int32_t *buffer, uint32_t sample_count);
+int check_crc_error (WavpackContext *wpc);
+int scan_float_data (WavpackStream *wps, f32 *values, int32_t num_values);
+void send_float_data (WavpackStream *wps, f32 *values, int32_t num_values);
+void float_values (WavpackStream *wps, int32_t *values, int32_t num_values);
+void dynamic_noise_shaping (WavpackContext *wpc, int32_t *buffer, int shortening_allowed);
+void execute_stereo (WavpackContext *wpc, int32_t *samples, int no_history, int do_samples);
+void execute_mono (WavpackContext *wpc, int32_t *samples, int no_history, int do_samples);
 
-// bits.c
+////////////////////////// DSD related (including decimation) //////////////////////////
+// modules: pack_dsd.c unpack_dsd.c
 
-void bs_open_read (Bitstream *bs, void *buffer_start, void *buffer_end);
-void bs_open_write (Bitstream *bs, void *buffer_start, void *buffer_end);
-uint32_t bs_close_read (Bitstream *bs);
-uint32_t bs_close_write (Bitstream *bs);
+void pack_dsd_init (WavpackContext *wpc);
+int pack_dsd_block (WavpackContext *wpc, int32_t *buffer);
+int init_dsd_block (WavpackContext *wpc, WavpackMetadata *wpmd);
+int32_t unpack_dsd_samples (WavpackContext *wpc, int32_t *buffer, uint32_t sample_count);
 
-int DoReadFile (FILE *hFile, void *lpBuffer, uint32_t nNumberOfBytesToRead, uint32_t *lpNumberOfBytesRead);
-int DoWriteFile (FILE *hFile, void *lpBuffer, uint32_t nNumberOfBytesToWrite, uint32_t *lpNumberOfBytesWritten);
-uint32_t DoGetFileSize (FILE *hFile), DoGetFilePosition (FILE *hFile);
-int DoSetFilePositionRelative (FILE *hFile, int32_t pos, int mode);
-int DoSetFilePositionAbsolute (FILE *hFile, uint32_t pos);
-int DoUngetc (int c, FILE *hFile), DoDeleteFile (char *filename);
-int DoCloseHandle (FILE *hFile), DoTruncateFile (FILE *hFile);
+void *decimate_dsd_init (int num_channels);
+void decimate_dsd_reset (void *decimate_context);
+void decimate_dsd_run (void *decimate_context, int32_t *samples, int num_samples);
+void decimate_dsd_destroy (void *decimate_context);
+
+///////////////////////////////// CPU feature detection ////////////////////////////////
+
+int unpack_cpu_has_feature_x86 (int findex), pack_cpu_has_feature_x86 (int findex);
+
+#define CPU_FEATURE_MMX     23      // findex value for MMX: bit 23 of EDX as returned by CPUID(1)
+
+///////////////////////////// pre-4.0 version decoding ////////////////////////////
+// modules: unpack3.c, unpack3_open.c, unpack3_seek.c
+
+WavpackContext *open_file3 (WavpackContext *wpc, char *error);
+int32_t unpack_samples3 (WavpackContext *wpc, int32_t *buffer, uint32_t sample_count);
+int seek_sample3 (WavpackContext *wpc, uint32_t desired_index);
+uint32_t get_sample_index3 (WavpackContext *wpc);
+void free_stream3 (WavpackContext *wpc);
+int get_version3 (WavpackContext *wpc);
+
+////////////////////////////// bitstream macros & functions /////////////////////////////
 
 #define bs_is_open(bs) ((bs)->ptr != NULL)
+uint32_t bs_close_read (Bitstream *bs);
 
 #define getbit(bs) ( \
     (((bs)->bc) ? \
@@ -564,56 +681,51 @@ int DoCloseHandle (FILE *hFile), DoTruncateFile (FILE *hFile);
         } while ((bs)->bc >= sizeof (*((bs)->ptr)) * 8); \
 } while (0)
 
-void little_endian_to_native (void *data, char *format);
-void native_to_little_endian (void *data, char *format);
+///////////////////////////// entropy encoder / decoder ////////////////////////////
+// modules: entropy_utils.c, read_words.c, write_words.c
 
-// pack.c
+// these control the time constant "slow_level" which is used for hybrid mode
+// that controls bitrate as a function of residual level (HYBRID_BITRATE).
+#define SLS 8
+#define SLO ((1 << (SLS - 1)))
 
-void pack_init (WavpackContext *wpc);
-int pack_block (WavpackContext *wpc, int32_t *buffer);
-double WavpackGetEncodedNoise (WavpackContext *wpc, double *peak);
+#define LIMIT_ONES 16   // maximum consecutive 1s sent for "div" data
 
-// unpack.c
+// these control the time constant of the 3 median level breakpoints
+#define DIV0 128        // 5/7 of samples
+#define DIV1 64         // 10/49 of samples
+#define DIV2 32         // 20/343 of samples
 
-int unpack_init (WavpackContext *wpc);
-int init_wv_bitstream (WavpackStream *wps, WavpackMetadata *wpmd);
-int init_wvc_bitstream (WavpackStream *wps, WavpackMetadata *wpmd);
-int init_wvx_bitstream (WavpackStream *wps, WavpackMetadata *wpmd);
-int read_decorr_terms (WavpackStream *wps, WavpackMetadata *wpmd);
-int read_decorr_weights (WavpackStream *wps, WavpackMetadata *wpmd);
-int read_decorr_samples (WavpackStream *wps, WavpackMetadata *wpmd);
-int read_shaping_info (WavpackStream *wps, WavpackMetadata *wpmd);
-int read_float_info (WavpackStream *wps, WavpackMetadata *wpmd);
-int read_int32_info (WavpackStream *wps, WavpackMetadata *wpmd);
-int read_channel_info (WavpackContext *wpc, WavpackMetadata *wpmd);
-int read_config_info (WavpackContext *wpc, WavpackMetadata *wpmd);
-int read_sample_rate (WavpackContext *wpc, WavpackMetadata *wpmd);
-int read_wrapper_data (WavpackContext *wpc, WavpackMetadata *wpmd);
-int32_t unpack_samples (WavpackContext *wpc, int32_t *buffer, uint32_t sample_count);
-int check_crc_error (WavpackContext *wpc);
+// this macro retrieves the specified median breakpoint (without frac; min = 1)
+#define GET_MED(med) (((c->median [med]) >> 4) + 1)
 
-// unpack3.c
+// These macros update the specified median breakpoints. Note that the median
+// is incremented when the sample is higher than the median, else decremented.
+// They are designed so that the median will never drop below 1 and the value
+// is essentially stationary if there are 2 increments for every 5 decrements.
 
-WavpackContext *open_file3 (WavpackContext *wpc, char *error);
-int32_t unpack_samples3 (WavpackContext *wpc, int32_t *buffer, uint32_t sample_count);
-int seek_sample3 (WavpackContext *wpc, uint32_t desired_index);
-uint32_t get_sample_index3 (WavpackContext *wpc);
-void free_stream3 (WavpackContext *wpc);
-int get_version3 (WavpackContext *wpc);
+#define INC_MED0() (c->median [0] += ((c->median [0] + DIV0) / DIV0) * 5)
+#define DEC_MED0() (c->median [0] -= ((c->median [0] + (DIV0-2)) / DIV0) * 2)
+#define INC_MED1() (c->median [1] += ((c->median [1] + DIV1) / DIV1) * 5)
+#define DEC_MED1() (c->median [1] -= ((c->median [1] + (DIV1-2)) / DIV1) * 2)
+#define INC_MED2() (c->median [2] += ((c->median [2] + DIV2) / DIV2) * 5)
+#define DEC_MED2() (c->median [2] -= ((c->median [2] + (DIV2-2)) / DIV2) * 2)
 
-// metadata.c stuff
-
-int read_metadata_buff (WavpackMetadata *wpmd, unsigned char *blockbuff, unsigned char **buffptr);
-int write_metadata_block (WavpackContext *wpc);
-int copy_metadata (WavpackMetadata *wpmd, unsigned char *buffer_start, unsigned char *buffer_end);
-int add_to_metadata (WavpackContext *wpc, void *data, uint32_t bcount, unsigned char id);
-int process_metadata (WavpackContext *wpc, WavpackMetadata *wpmd);
-void free_metadata (WavpackMetadata *wpmd);
-
-// words.c stuff
+#ifdef HAVE___BUILTIN_CLZ   // count_bits(av): position of the highest set bit of av plus one (0 for av == 0)
+#define count_bits(av) ((av) ? 32 - __builtin_clz (av) : 0)
+#elif defined (_WIN64)      // same result via the bit-scan intrinsic: index of highest set bit, plus one
+static __inline int count_bits (uint32_t av) { unsigned long res; return _BitScanReverse (&res, av) ? (int)(res + 1) : 0; }
+#else                       // fallback: look up the most significant nonzero byte in nbits_table and add that byte's bit offset
+#define count_bits(av) ( \
+ (av) < (1 << 8) ? nbits_table [av] : \
+  ( \
+   (av) < (1L << 16) ? nbits_table [(av) >> 8] + 8 : \
+   ((av) < (1L << 24) ? nbits_table [(av) >> 16] + 16 : nbits_table [(av) >> 24] + 24) \
+  ) \
+)
+#endif
 
 void init_words (WavpackStream *wps);
-void word_set_bitrate (WavpackStream *wps);
 void write_entropy_vars (WavpackStream *wps, WavpackMetadata *wpmd);
 void write_hybrid_profile (WavpackStream *wps, WavpackMetadata *wpmd);
 int read_entropy_vars (WavpackStream *wps, WavpackMetadata *wpmd);
@@ -625,34 +737,39 @@ int32_t get_words_lossless (WavpackStream *wps, int32_t *buffer, int32_t nsample
 void flush_word (WavpackStream *wps);
 int32_t nosend_word (WavpackStream *wps, int32_t value, int chan);
 void scan_word (WavpackStream *wps, int32_t *samples, uint32_t num_samples, int dir);
+void update_error_limit (WavpackStream *wps);
 
-int log2s (int32_t value);
-int32_t exp2s (int log);
-uint32_t log2buffer (int32_t *samples, uint32_t num_samples, int limit);
+extern const uint32_t bitset [32];
+extern const uint32_t bitmask [32];
+extern const char nbits_table [256];
+
+int wp_log2s (int32_t value);
+int32_t wp_exp2s (int log);
+int FASTCALL wp_log2 (uint32_t avalue);
+
+#ifdef OPT_ASM_X86
+#define LOG2BUFFER log2buffer_x86
+#elif defined(OPT_ASM_X64) && (defined (_WIN64) || defined(__CYGWIN__) || defined(__MINGW64__))
+#define LOG2BUFFER log2buffer_x64win
+#elif defined(OPT_ASM_X64)
+#define LOG2BUFFER log2buffer_x64
+#else
+#define LOG2BUFFER log2buffer
+#endif
+
+uint32_t LOG2BUFFER (int32_t *samples, uint32_t num_samples, int limit);
 
 signed char store_weight (int weight);
 int restore_weight (signed char weight);
 
 #define WORD_EOF ((int32_t)(1L << 31))
 
-// float.c
-
-void write_float_info (WavpackStream *wps, WavpackMetadata *wpmd);
-int scan_float_data (WavpackStream *wps, f32 *values, int32_t num_values);
-void send_float_data (WavpackStream *wps, f32 *values, int32_t num_values);
-int read_float_info (WavpackStream *wps, WavpackMetadata *wpmd);
-void float_values (WavpackStream *wps, int32_t *values, int32_t num_values);
 void WavpackFloatNormalize (int32_t *values, int32_t num_values, int delta_exp);
 
-// extra?.c
-
-// void analyze_stereo (WavpackContext *wpc, int32_t *samples);
-// void analyze_mono (WavpackContext *wpc, int32_t *samples);
-void execute_stereo (WavpackContext *wpc, int32_t *samples, int no_history, int do_samples);
-void execute_mono (WavpackContext *wpc, int32_t *samples, int no_history, int do_samples);
-
-// wputils.c
+/////////////////////////// high-level unpacking API and support ////////////////////////////
+// modules: open_utils.c, unpack_utils.c, unpack_seek.c, unpack_floats.c
 
+WavpackContext *WavpackOpenFileInputEx64 (WavpackStreamReader64 *reader, void *wv_id, void *wvc_id, char *error, int flags, int norm_offset);
 WavpackContext *WavpackOpenFileInputEx (WavpackStreamReader *reader, void *wv_id, void *wvc_id, char *error, int flags, int norm_offset);
 WavpackContext *WavpackOpenFileInput (const char *infilename, char *error, int flags, int norm_offset);
 
@@ -664,6 +781,16 @@ WavpackContext *WavpackOpenFileInput (const char *infilename, char *error, int f
 #define OPEN_STREAMING  0x20    // "streaming" mode blindly unpacks blocks
                                 // w/o regard to header file position info
 #define OPEN_EDIT_TAGS  0x40    // allow editing of tags
+#define OPEN_FILE_UTF8  0x80    // assume filenames are UTF-8 encoded, not ANSI (Windows only)
+
+// new for version 5
+
+#define OPEN_DSD_NATIVE 0x100   // open DSD files as bitstreams
+                                // (returned as 8-bit "samples" stored in 32-bit words)
+#define OPEN_DSD_AS_PCM 0x200   // open DSD files as 24-bit PCM (decimated 8x)
+#define OPEN_ALT_TYPES  0x400   // application is aware of alternate file types & qmode
+                                // (just affects retrieving wrappers & MD5 checksums)
+#define OPEN_NO_CHECKSUM 0x800  // don't verify block checksums before decoding
 
 int WavpackGetMode (WavpackContext *wpc);
 
@@ -682,15 +809,38 @@ int WavpackGetMode (WavpackContext *wpc);
 #define MODE_XMODE      0x7000  // mask for extra level (1-6, 0=unknown)
 #define MODE_DNS        0x8000
 
-char *WavpackGetErrorMessage (WavpackContext *wpc);
+int WavpackGetQualifyMode (WavpackContext *wpc);
 int WavpackGetVersion (WavpackContext *wpc);
 uint32_t WavpackUnpackSamples (WavpackContext *wpc, int32_t *buffer, uint32_t samples);
-uint32_t WavpackGetNumSamples (WavpackContext *wpc);
-uint32_t WavpackGetSampleIndex (WavpackContext *wpc);
-int WavpackGetNumErrors (WavpackContext *wpc);
-int WavpackLossyBlocks (WavpackContext *wpc);
 int WavpackSeekSample (WavpackContext *wpc, uint32_t sample);
-WavpackContext *WavpackCloseFile (WavpackContext *wpc);
+int WavpackSeekSample64 (WavpackContext *wpc, int64_t sample);
+int WavpackGetMD5Sum (WavpackContext *wpc, unsigned char data [16]);
+
+int WavpackVerifySingleBlock (unsigned char *buffer, int verify_checksum);
+uint32_t read_next_header (WavpackStreamReader64 *reader, void *id, WavpackHeader *wphdr);
+int read_wvc_block (WavpackContext *wpc);
+
+/////////////////////////// high-level packing API and support ////////////////////////////
+// modules: pack_utils.c, pack_floats.c
+
+WavpackContext *WavpackOpenFileOutput (WavpackBlockOutput blockout, void *wv_id, void *wvc_id);
+int WavpackSetConfiguration (WavpackContext *wpc, WavpackConfig *config, uint32_t total_samples);
+int WavpackSetConfiguration64 (WavpackContext *wpc, WavpackConfig *config, int64_t total_samples, const unsigned char *chan_ids);
+int WavpackPackInit (WavpackContext *wpc);
+int WavpackAddWrapper (WavpackContext *wpc, void *data, uint32_t bcount);
+int WavpackPackSamples (WavpackContext *wpc, int32_t *sample_buffer, uint32_t sample_count);
+int WavpackFlushSamples (WavpackContext *wpc);
+int WavpackStoreMD5Sum (WavpackContext *wpc, unsigned char data [16]);
+void WavpackSeekTrailingWrapper (WavpackContext *wpc);
+void WavpackUpdateNumSamples (WavpackContext *wpc, void *first_block);
+void *WavpackGetWrapperLocation (void *first_block, uint32_t *size);
+
+/////////////////////////////////// common utilities ////////////////////////////////////
+// module: common_utils.c
+
+extern const uint32_t sample_rates [16];
+uint32_t WavpackGetLibraryVersion (void);
+const char *WavpackGetLibraryVersionString (void);
 uint32_t WavpackGetSampleRate (WavpackContext *wpc);
 int WavpackGetBitsPerSample (WavpackContext *wpc);
 int WavpackGetBytesPerSample (WavpackContext *wpc);
@@ -698,34 +848,33 @@ int WavpackGetNumChannels (WavpackContext *wpc);
 int WavpackGetChannelMask (WavpackContext *wpc);
 int WavpackGetReducedChannels (WavpackContext *wpc);
 int WavpackGetFloatNormExp (WavpackContext *wpc);
-int WavpackGetMD5Sum (WavpackContext *wpc, unsigned char data [16]);
+uint32_t WavpackGetNumSamples (WavpackContext *wpc);
+int64_t WavpackGetNumSamples64 (WavpackContext *wpc);
+uint32_t WavpackGetSampleIndex (WavpackContext *wpc);
+int64_t WavpackGetSampleIndex64 (WavpackContext *wpc);
+char *WavpackGetErrorMessage (WavpackContext *wpc);
+int WavpackGetNumErrors (WavpackContext *wpc);
+int WavpackLossyBlocks (WavpackContext *wpc);
 uint32_t WavpackGetWrapperBytes (WavpackContext *wpc);
 unsigned char *WavpackGetWrapperData (WavpackContext *wpc);
 void WavpackFreeWrapper (WavpackContext *wpc);
-void WavpackSeekTrailingWrapper (WavpackContext *wpc);
 double WavpackGetProgress (WavpackContext *wpc);
 uint32_t WavpackGetFileSize (WavpackContext *wpc);
+int64_t WavpackGetFileSize64 (WavpackContext *wpc);
 double WavpackGetRatio (WavpackContext *wpc);
 double WavpackGetAverageBitrate (WavpackContext *wpc, int count_wvc);
 double WavpackGetInstantBitrate (WavpackContext *wpc);
-
-WavpackContext *WavpackOpenFileOutput (WavpackBlockOutput blockout, void *wv_id, void *wvc_id);
-int WavpackSetConfiguration (WavpackContext *wpc, WavpackConfig *config, uint32_t total_samples);
-int WavpackAddWrapper (WavpackContext *wpc, void *data, uint32_t bcount);
-int WavpackStoreMD5Sum (WavpackContext *wpc, unsigned char data [16]);
-int WavpackPackInit (WavpackContext *wpc);
-int WavpackPackSamples (WavpackContext *wpc, int32_t *sample_buffer, uint32_t sample_count);
-int WavpackFlushSamples (WavpackContext *wpc);
-void WavpackUpdateNumSamples (WavpackContext *wpc, void *first_block);
-void *WavpackGetWrapperLocation (void *first_block, uint32_t *size);
-
+WavpackContext *WavpackCloseFile (WavpackContext *wpc);
 void WavpackLittleEndianToNative (void *data, char *format);
 void WavpackNativeToLittleEndian (void *data, char *format);
+void WavpackBigEndianToNative (void *data, char *format);
+void WavpackNativeToBigEndian (void *data, char *format);
 
-uint32_t WavpackGetLibraryVersion (void);
-const char *WavpackGetLibraryVersionString (void);
+void install_close_callback (WavpackContext *wpc, void cb_func (void *wpc));
+void free_streams (WavpackContext *wpc);
 
-// tags.c
+/////////////////////////////////// tag utilities ////////////////////////////////////
+// modules: tags.c, tag_utils.c
 
 int WavpackGetNumTagItems (WavpackContext *wpc);
 int WavpackGetTagItem (WavpackContext *wpc, const char *item, char *value, int size);
@@ -742,58 +891,5 @@ void free_tag (M_Tag *m_tag);
 int valid_tag (M_Tag *m_tag);
 int editable_tag (M_Tag *m_tag);
 
-///////////////////////////// SIMD helper macros /////////////////////////////
-
-#ifdef OPT_MMX
-
-#if defined (__GNUC__) && !defined (__INTEL_COMPILER)
-//directly map to gcc's native builtins for faster code
-
-#if __GNUC__ < 4
-typedef int __di __attribute__ ((__mode__ (__DI__)));
-typedef int __m64 __attribute__ ((__mode__ (__V2SI__)));
-typedef int __v4hi __attribute__ ((__mode__ (__V4HI__)));
-#define _m_paddsw(m1, m2) (__m64) __builtin_ia32_paddsw ((__v4hi) m1, (__v4hi) m2)
-#define _m_pand(m1, m2) (__m64) __builtin_ia32_pand ((__di) m1, (__di) m2)
-#define _m_pandn(m1, m2) (__m64) __builtin_ia32_pandn ((__di) m1, (__di) m2)
-#define _m_pmaddwd(m1, m2) __builtin_ia32_pmaddwd ((__v4hi) m1, (__v4hi) m2)
-#define _m_por(m1, m2) (__m64) __builtin_ia32_por ((__di) m1, (__di) m2)
-#define _m_pxor(m1, m2) (__m64) __builtin_ia32_pxor ((__di) m1, (__di) m2)
-#else
-typedef int __m64 __attribute__ ((__vector_size__ (8)));
-typedef short __m64_16 __attribute__ ((__vector_size__ (8)));
-#define _m_paddsw(m1, m2) (__m64) __builtin_ia32_paddsw ((__m64_16) m1, (__m64_16) m2)
-#define _m_pand(m1, m2) __builtin_ia32_pand (m1, m2)
-#define _m_pandn(m1, m2) __builtin_ia32_pandn (m1, m2)
-#define _m_pmaddwd(m1, m2) __builtin_ia32_pmaddwd ((__m64_16) m1, (__m64_16) m2)
-#define _m_por(m1, m2) __builtin_ia32_por (m1, m2)
-#define _m_pxor(m1, m2) __builtin_ia32_pxor (m1, m2)
 #endif
 
-#define _m_paddd(m1, m2) __builtin_ia32_paddd (m1, m2)
-#define _m_pcmpeqd(m1, m2) __builtin_ia32_pcmpeqd (m1, m2)
-
-#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 4) || __GNUC__ > 4 || __has_builtin(__builtin_ia32_pslldi)
-#	define _m_pslldi(m1, m2) __builtin_ia32_pslldi ((__m64)m1, m2)
-#	define _m_psradi(m1, m2) __builtin_ia32_psradi ((__m64)m1, m2)
-#	define _m_psrldi(m1, m2) __builtin_ia32_psrldi ((__m64)m1, m2)
-#else
-#	define _m_pslldi(m1, m2) __builtin_ia32_pslld (m1, m2)
-#	define _m_psradi(m1, m2) __builtin_ia32_psrad (m1, m2)
-#	define _m_psrldi(m1, m2) __builtin_ia32_psrld (m1, m2)
-#endif
-
-#define _m_psubd(m1, m2) __builtin_ia32_psubd (m1, m2)
-#define _m_punpckhdq(m1, m2) __builtin_ia32_punpckhdq (m1, m2)
-#define _m_punpckldq(m1, m2) __builtin_ia32_punpckldq (m1, m2)
-#define _mm_empty() __builtin_ia32_emms ()
-#define _mm_set_pi32(m1, m2) { m2, m1 }
-#define _mm_set1_pi32(m) { m, m }
-
-#else
-#include <mmintrin.h>
-#endif
-
-#endif //OPT_MMX
-
-#endif
diff --git a/third_party/wavpack/src/wavpack_version.h b/third_party/wavpack/src/wavpack_version.h
index ed6e241..6acf274 100644
--- a/third_party/wavpack/src/wavpack_version.h
+++ b/third_party/wavpack/src/wavpack_version.h
@@ -11,9 +11,9 @@
 #ifndef WAVPACK_VERSION_H
 #define WAVPACK_VERSION_H
 
-#define LIBWAVPACK_MAJOR 4
-#define LIBWAVPACK_MINOR 70
+#define LIBWAVPACK_MAJOR 5
+#define LIBWAVPACK_MINOR 1
 #define LIBWAVPACK_MICRO 0
-#define LIBWAVPACK_VERSION_STRING "4.70.0"
+#define LIBWAVPACK_VERSION_STRING "5.1.0"
 
 #endif
diff --git a/third_party/wavpack/src/words.c b/third_party/wavpack/src/words.c
deleted file mode 100644
index 368b07a..0000000
--- a/third_party/wavpack/src/words.c
+++ /dev/null
@@ -1,1525 +0,0 @@
-////////////////////////////////////////////////////////////////////////////
-//                           **** WAVPACK ****                            //
-//                  Hybrid Lossless Wavefile Compressor                   //
-//              Copyright (c) 1998 - 2006 Conifer Software.               //
-//                          All Rights Reserved.                          //
-//      Distributed under the BSD Software License (see license.txt)      //
-////////////////////////////////////////////////////////////////////////////
-
-// words.c
-
-// This module provides entropy word encoding and decoding functions using
-// a variation on the Rice method.  This was introduced in version 3.93
-// because it allows splitting the data into a "lossy" stream and a
-// "correction" stream in a very efficient manner and is therefore ideal
-// for the "hybrid" mode.  For 4.0, the efficiency of this method was
-// significantly improved by moving away from the normal Rice restriction of
-// using powers of two for the modulus divisions and now the method can be
-// used for both hybrid and pure lossless encoding.
-
-// Samples are divided by median probabilities at 5/7 (71.43%), 10/49 (20.41%),
-// and 20/343 (5.83%). Each zone has 3.5 times fewer samples than the
-// previous. Using standard Rice coding on this data would result in 1.4
-// bits per sample average (not counting sign bit). However, there is a
-// very simple encoding that is over 99% efficient with this data and
-// results in about 1.22 bits per sample.
-
-#include "wavpack_local.h"
-
-#include <stdlib.h>
-#include <string.h>
-
-#ifdef DEBUG_ALLOC
-#define malloc malloc_db
-#define realloc realloc_db
-#define free free_db
-void *malloc_db (uint32_t size);
-void *realloc_db (void *ptr, uint32_t size);
-void free_db (void *ptr);
-int32_t dump_alloc (void);
-#endif
-
-//////////////////////////////// local macros /////////////////////////////////
-
-#define USE_NEXT8_OPTIMIZATION  // we normally want this, but code is easier to understand without it
-
-#define LIMIT_ONES 16   // maximum consecutive 1s sent for "div" data
-
-// these control the time constant "slow_level" which is used for hybrid mode
-// that controls bitrate as a function of residual level (HYBRID_BITRATE).
-#define SLS 8
-#define SLO ((1 << (SLS - 1)))
-
-// these control the time constant of the 3 median level breakpoints
-#define DIV0 128        // 5/7 of samples
-#define DIV1 64         // 10/49 of samples
-#define DIV2 32         // 20/343 of samples
-
-// this macro retrieves the specified median breakpoint (without frac; min = 1)
-#define GET_MED(med) (((c->median [med]) >> 4) + 1)
-
-// These macros update the specified median breakpoints. Note that the median
-// is incremented when the sample is higher than the median, else decremented.
-// They are designed so that the median will never drop below 1 and the value
-// is essentially stationary if there are 2 increments for every 5 decrements.
-
-#define INC_MED0() (c->median [0] += ((c->median [0] + DIV0) / DIV0) * 5)
-#define DEC_MED0() (c->median [0] -= ((c->median [0] + (DIV0-2)) / DIV0) * 2)
-#define INC_MED1() (c->median [1] += ((c->median [1] + DIV1) / DIV1) * 5)
-#define DEC_MED1() (c->median [1] -= ((c->median [1] + (DIV1-2)) / DIV1) * 2)
-#define INC_MED2() (c->median [2] += ((c->median [2] + DIV2) / DIV2) * 5)
-#define DEC_MED2() (c->median [2] -= ((c->median [2] + (DIV2-2)) / DIV2) * 2)
-
-#define count_bits(av) ( \
- (av) < (1 << 8) ? nbits_table [av] : \
-  ( \
-   (av) < (1L << 16) ? nbits_table [(av) >> 8] + 8 : \
-   ((av) < (1L << 24) ? nbits_table [(av) >> 16] + 16 : nbits_table [(av) >> 24] + 24) \
-  ) \
-)
-
-///////////////////////////// local table storage ////////////////////////////
-
-const uint32_t bitset [] = {
-    1L << 0, 1L << 1, 1L << 2, 1L << 3,
-    1L << 4, 1L << 5, 1L << 6, 1L << 7,
-    1L << 8, 1L << 9, 1L << 10, 1L << 11,
-    1L << 12, 1L << 13, 1L << 14, 1L << 15,
-    1L << 16, 1L << 17, 1L << 18, 1L << 19,
-    1L << 20, 1L << 21, 1L << 22, 1L << 23,
-    1L << 24, 1L << 25, 1L << 26, 1L << 27,
-    1L << 28, 1L << 29, 1L << 30, 1L << 31
-};
-
-const uint32_t bitmask [] = {
-    (1L << 0) - 1, (1L << 1) - 1, (1L << 2) - 1, (1L << 3) - 1,
-    (1L << 4) - 1, (1L << 5) - 1, (1L << 6) - 1, (1L << 7) - 1,
-    (1L << 8) - 1, (1L << 9) - 1, (1L << 10) - 1, (1L << 11) - 1,
-    (1L << 12) - 1, (1L << 13) - 1, (1L << 14) - 1, (1L << 15) - 1,
-    (1L << 16) - 1, (1L << 17) - 1, (1L << 18) - 1, (1L << 19) - 1,
-    (1L << 20) - 1, (1L << 21) - 1, (1L << 22) - 1, (1L << 23) - 1,
-    (1L << 24) - 1, (1L << 25) - 1, (1L << 26) - 1, (1L << 27) - 1,
-    (1L << 28) - 1, (1L << 29) - 1, (1L << 30) - 1, 0x7fffffff
-};
-
-const char nbits_table [] = {
-    0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,     // 0 - 15
-    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,     // 16 - 31
-    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,     // 32 - 47
-    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,     // 48 - 63
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,     // 64 - 79
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,     // 80 - 95
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,     // 96 - 111
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,     // 112 - 127
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,     // 128 - 143
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,     // 144 - 159
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,     // 160 - 175
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,     // 176 - 191
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,     // 192 - 207
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,     // 208 - 223
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,     // 224 - 239
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8      // 240 - 255
-};
-
-static const unsigned char log2_table [] = {
-    0x00, 0x01, 0x03, 0x04, 0x06, 0x07, 0x09, 0x0a, 0x0b, 0x0d, 0x0e, 0x10, 0x11, 0x12, 0x14, 0x15,
-    0x16, 0x18, 0x19, 0x1a, 0x1c, 0x1d, 0x1e, 0x20, 0x21, 0x22, 0x24, 0x25, 0x26, 0x28, 0x29, 0x2a,
-    0x2c, 0x2d, 0x2e, 0x2f, 0x31, 0x32, 0x33, 0x34, 0x36, 0x37, 0x38, 0x39, 0x3b, 0x3c, 0x3d, 0x3e,
-    0x3f, 0x41, 0x42, 0x43, 0x44, 0x45, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4d, 0x4e, 0x4f, 0x50, 0x51,
-    0x52, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63,
-    0x64, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x74, 0x75,
-    0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85,
-    0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95,
-    0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
-    0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb2,
-    0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc0,
-    0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcb, 0xcc, 0xcd, 0xce,
-    0xcf, 0xd0, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd8, 0xd9, 0xda, 0xdb,
-    0xdc, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe4, 0xe5, 0xe6, 0xe7, 0xe7,
-    0xe8, 0xe9, 0xea, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xee, 0xef, 0xf0, 0xf1, 0xf1, 0xf2, 0xf3, 0xf4,
-    0xf4, 0xf5, 0xf6, 0xf7, 0xf7, 0xf8, 0xf9, 0xf9, 0xfa, 0xfb, 0xfc, 0xfc, 0xfd, 0xfe, 0xff, 0xff
-};
-
-static const unsigned char exp2_table [] = {
-    0x00, 0x01, 0x01, 0x02, 0x03, 0x03, 0x04, 0x05, 0x06, 0x06, 0x07, 0x08, 0x08, 0x09, 0x0a, 0x0b,
-    0x0b, 0x0c, 0x0d, 0x0e, 0x0e, 0x0f, 0x10, 0x10, 0x11, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16, 0x16,
-    0x17, 0x18, 0x19, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1d, 0x1e, 0x1f, 0x20, 0x20, 0x21, 0x22, 0x23,
-    0x24, 0x24, 0x25, 0x26, 0x27, 0x28, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
-    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3a, 0x3b, 0x3c, 0x3d,
-    0x3e, 0x3f, 0x40, 0x41, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x48, 0x49, 0x4a, 0x4b,
-    0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a,
-    0x5b, 0x5c, 0x5d, 0x5e, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
-    0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
-    0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x87, 0x88, 0x89, 0x8a,
-    0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b,
-    0x9c, 0x9d, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad,
-    0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0,
-    0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc8, 0xc9, 0xca, 0xcb, 0xcd, 0xce, 0xcf, 0xd0, 0xd2, 0xd3, 0xd4,
-    0xd6, 0xd7, 0xd8, 0xd9, 0xdb, 0xdc, 0xdd, 0xde, 0xe0, 0xe1, 0xe2, 0xe4, 0xe5, 0xe6, 0xe8, 0xe9,
-    0xea, 0xec, 0xed, 0xee, 0xf0, 0xf1, 0xf2, 0xf4, 0xf5, 0xf6, 0xf8, 0xf9, 0xfa, 0xfc, 0xfd, 0xff
-};
-
-#ifdef USE_NEXT8_OPTIMIZATION
-static const char ones_count_table [] = {
-    0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,
-    0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,
-    0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,
-    0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,7,
-    0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,
-    0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,
-    0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,
-    0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,8
-};
-#endif
-
-///////////////////////////// executable code ////////////////////////////////
-
-static int FASTCALL mylog2 (uint32_t avalue);
-
-// Initialize entropy encoder for the specified stream. In lossless mode there
-// are no parameters to select; in hybrid mode the bitrate mode and value need
-// be initialized.
-
-#ifndef NO_PACK
-
-void init_words (WavpackStream *wps)
-{
-    CLEAR (wps->w);
-
-    if (wps->wphdr.flags & HYBRID_FLAG)
-        word_set_bitrate (wps);
-}
-
-// Set up parameters for hybrid mode based on header flags and "bits" field.
-// This is currently only set up for the HYBRID_BITRATE mode in which the
-// allowed error varies with the residual level (from "slow_level"). The
-// simpler mode (which is not used yet) has the error level directly
-// controlled from the metadata.
-
-void word_set_bitrate (WavpackStream *wps)
-{
-    int bitrate_0, bitrate_1;
-
-    if (wps->wphdr.flags & HYBRID_BITRATE) {
-        if (wps->wphdr.flags & FALSE_STEREO)
-            bitrate_0 = (wps->bits * 2 - 512) < 568 ? 0 : (wps->bits * 2 - 512) - 568;
-        else
-            bitrate_0 = wps->bits < 568 ? 0 : wps->bits - 568;
-
-        if (!(wps->wphdr.flags & MONO_DATA)) {
-
-            if (wps->wphdr.flags & HYBRID_BALANCE)
-                bitrate_1 = (wps->wphdr.flags & JOINT_STEREO) ? 256 : 0;
-            else {
-                bitrate_1 = bitrate_0;
-
-                if (wps->wphdr.flags & JOINT_STEREO) {
-                    if (bitrate_0 < 128) {
-                        bitrate_1 += bitrate_0;
-                        bitrate_0 = 0;
-                    }
-                    else {
-                        bitrate_0 -= 128;
-                        bitrate_1 += 128;
-                    }
-                }
-            }
-        }
-        else
-            bitrate_1 = 0;
-    }
-    else
-        bitrate_0 = bitrate_1 = 0;
-
-    wps->w.bitrate_acc [0] = (int32_t) bitrate_0 << 16;
-    wps->w.bitrate_acc [1] = (int32_t) bitrate_1 << 16;
-}
-
-// Allocates the correct space in the metadata structure and writes the
-// current median values to it. Values are converted from 32-bit unsigned
-// to our internal 16-bit mylog2 values, and read_entropy_vars () is called
-// to read the values back because we must compensate for the loss through
-// the log function.
-
-void write_entropy_vars (WavpackStream *wps, WavpackMetadata *wpmd)
-{
-    unsigned char *byteptr;
-    int temp;
-
-    byteptr = wpmd->data = malloc (12);
-    wpmd->id = ID_ENTROPY_VARS;
-
-    *byteptr++ = temp = mylog2 (wps->w.c [0].median [0]);
-    *byteptr++ = temp >> 8;
-    *byteptr++ = temp = mylog2 (wps->w.c [0].median [1]);
-    *byteptr++ = temp >> 8;
-    *byteptr++ = temp = mylog2 (wps->w.c [0].median [2]);
-    *byteptr++ = temp >> 8;
-
-    if (!(wps->wphdr.flags & MONO_DATA)) {
-        *byteptr++ = temp = mylog2 (wps->w.c [1].median [0]);
-        *byteptr++ = temp >> 8;
-        *byteptr++ = temp = mylog2 (wps->w.c [1].median [1]);
-        *byteptr++ = temp >> 8;
-        *byteptr++ = temp = mylog2 (wps->w.c [1].median [2]);
-        *byteptr++ = temp >> 8;
-    }
-
-    wpmd->byte_length = (int32_t)(byteptr - (unsigned char *) wpmd->data);
-    read_entropy_vars (wps, wpmd);
-}
-
-// Allocates enough space in the metadata structure and writes the current
-// high word of the bitrate accumulator and the slow_level values to it. The
-// slow_level values are converted from 32-bit unsigned to our internal 16-bit
-// mylog2 values. Afterward, read_entropy_vars () is called to read the values
-// back because we must compensate for the loss through the log function and
-// the truncation of the bitrate.
-
-void write_hybrid_profile (WavpackStream *wps, WavpackMetadata *wpmd)
-{
-    unsigned char *byteptr;
-    int temp;
-
-    word_set_bitrate (wps);
-    byteptr = wpmd->data = malloc (512);
-    wpmd->id = ID_HYBRID_PROFILE;
-
-    if (wps->wphdr.flags & HYBRID_BITRATE) {
-        *byteptr++ = temp = log2s (wps->w.c [0].slow_level);
-        *byteptr++ = temp >> 8;
-
-        if (!(wps->wphdr.flags & MONO_DATA)) {
-            *byteptr++ = temp = log2s (wps->w.c [1].slow_level);
-            *byteptr++ = temp >> 8;
-        }
-    }
-
-    *byteptr++ = temp = wps->w.bitrate_acc [0] >> 16;
-    *byteptr++ = temp >> 8;
-
-    if (!(wps->wphdr.flags & MONO_DATA)) {
-        *byteptr++ = temp = wps->w.bitrate_acc [1] >> 16;
-        *byteptr++ = temp >> 8;
-    }
-
-    if (wps->w.bitrate_delta [0] | wps->w.bitrate_delta [1]) {
-        *byteptr++ = temp = log2s (wps->w.bitrate_delta [0]);
-        *byteptr++ = temp >> 8;
-
-        if (!(wps->wphdr.flags & MONO_DATA)) {
-            *byteptr++ = temp = log2s (wps->w.bitrate_delta [1]);
-            *byteptr++ = temp >> 8;
-        }
-    }
-
-    wpmd->byte_length = (int32_t)(byteptr - (unsigned char *) wpmd->data);
-    read_hybrid_profile (wps, wpmd);
-}
-
-#endif
-
-// Read the median log2 values from the specifed metadata structure, convert
-// them back to 32-bit unsigned values and store them. If length is not
-// exactly correct then we flag and return an error.
-
-int read_entropy_vars (WavpackStream *wps, WavpackMetadata *wpmd)
-{
-    unsigned char *byteptr = wpmd->data;
-
-    if (wpmd->byte_length != ((wps->wphdr.flags & MONO_DATA) ? 6 : 12))
-        return FALSE;
-
-    wps->w.c [0].median [0] = exp2s (byteptr [0] + (byteptr [1] << 8));
-    wps->w.c [0].median [1] = exp2s (byteptr [2] + (byteptr [3] << 8));
-    wps->w.c [0].median [2] = exp2s (byteptr [4] + (byteptr [5] << 8));
-
-    if (!(wps->wphdr.flags & MONO_DATA)) {
-        wps->w.c [1].median [0] = exp2s (byteptr [6] + (byteptr [7] << 8));
-        wps->w.c [1].median [1] = exp2s (byteptr [8] + (byteptr [9] << 8));
-        wps->w.c [1].median [2] = exp2s (byteptr [10] + (byteptr [11] << 8));
-    }
-
-    return TRUE;
-}
-
-// Read the hybrid related values from the specifed metadata structure, convert
-// them back to their internal formats and store them. The extended profile
-// stuff is not implemented yet, so return an error if we get more data than
-// we know what to do with.
-
-int read_hybrid_profile (WavpackStream *wps, WavpackMetadata *wpmd)
-{
-    unsigned char *byteptr = wpmd->data;
-    unsigned char *endptr = byteptr + wpmd->byte_length;
-
-    if (wps->wphdr.flags & HYBRID_BITRATE) {
-        if (byteptr + (wps->wphdr.flags & MONO_DATA ? 2 : 4) > endptr)
-            return FALSE;
-
-        wps->w.c [0].slow_level = exp2s (byteptr [0] + (byteptr [1] << 8));
-        byteptr += 2;
-
-        if (!(wps->wphdr.flags & MONO_DATA)) {
-            wps->w.c [1].slow_level = exp2s (byteptr [0] + (byteptr [1] << 8));
-            byteptr += 2;
-        }
-    }
-
-    if (byteptr + (wps->wphdr.flags & MONO_DATA ? 2 : 4) > endptr)
-        return FALSE;
-
-    wps->w.bitrate_acc [0] = (int32_t)(byteptr [0] + (byteptr [1] << 8)) << 16;
-    byteptr += 2;
-
-    if (!(wps->wphdr.flags & MONO_DATA)) {
-        wps->w.bitrate_acc [1] = (int32_t)(byteptr [0] + (byteptr [1] << 8)) << 16;
-        byteptr += 2;
-    }
-
-    if (byteptr < endptr) {
-        if (byteptr + (wps->wphdr.flags & MONO_DATA ? 2 : 4) > endptr)
-            return FALSE;
-
-        wps->w.bitrate_delta [0] = exp2s ((short)(byteptr [0] + (byteptr [1] << 8)));
-        byteptr += 2;
-
-        if (!(wps->wphdr.flags & MONO_DATA)) {
-            wps->w.bitrate_delta [1] = exp2s ((short)(byteptr [0] + (byteptr [1] << 8)));
-            byteptr += 2;
-        }
-
-        if (byteptr < endptr)
-            return FALSE;
-    }
-    else
-        wps->w.bitrate_delta [0] = wps->w.bitrate_delta [1] = 0;
-
-    return TRUE;
-}
-
-// This function is called during both encoding and decoding of hybrid data to
-// update the "error_limit" variable which determines the maximum sample error
-// allowed in the main bitstream. In the HYBRID_BITRATE mode (which is the only
-// currently implemented) this is calculated from the slow_level values and the
-// bitrate accumulators. Note that the bitrate accumulators can be changing.
-
-static void update_error_limit (WavpackStream *wps)
-{
-    int bitrate_0 = (wps->w.bitrate_acc [0] += wps->w.bitrate_delta [0]) >> 16;
-
-    if (wps->wphdr.flags & MONO_DATA) {
-        if (wps->wphdr.flags & HYBRID_BITRATE) {
-            int slow_log_0 = (wps->w.c [0].slow_level + SLO) >> SLS;
-
-            if (slow_log_0 - bitrate_0 > -0x100)
-                wps->w.c [0].error_limit = exp2s (slow_log_0 - bitrate_0 + 0x100);
-            else
-                wps->w.c [0].error_limit = 0;
-        }
-        else
-            wps->w.c [0].error_limit = exp2s (bitrate_0);
-    }
-    else {
-        int bitrate_1 = (wps->w.bitrate_acc [1] += wps->w.bitrate_delta [1]) >> 16;
-
-        if (wps->wphdr.flags & HYBRID_BITRATE) {
-            int slow_log_0 = (wps->w.c [0].slow_level + SLO) >> SLS;
-            int slow_log_1 = (wps->w.c [1].slow_level + SLO) >> SLS;
-
-            if (wps->wphdr.flags & HYBRID_BALANCE) {
-                int balance = (slow_log_1 - slow_log_0 + bitrate_1 + 1) >> 1;
-
-                if (balance > bitrate_0) {
-                    bitrate_1 = bitrate_0 * 2;
-                    bitrate_0 = 0;
-                }
-                else if (-balance > bitrate_0) {
-                    bitrate_0 = bitrate_0 * 2;
-                    bitrate_1 = 0;
-                }
-                else {
-                    bitrate_1 = bitrate_0 + balance;
-                    bitrate_0 = bitrate_0 - balance;
-                }
-            }
-
-            if (slow_log_0 - bitrate_0 > -0x100)
-                wps->w.c [0].error_limit = exp2s (slow_log_0 - bitrate_0 + 0x100);
-            else
-                wps->w.c [0].error_limit = 0;
-
-            if (slow_log_1 - bitrate_1 > -0x100)
-                wps->w.c [1].error_limit = exp2s (slow_log_1 - bitrate_1 + 0x100);
-            else
-                wps->w.c [1].error_limit = 0;
-        }
-        else {
-            wps->w.c [0].error_limit = exp2s (bitrate_0);
-            wps->w.c [1].error_limit = exp2s (bitrate_1);
-        }
-    }
-}
-
-#ifndef NO_PACK
-
-// This function writes the specified word to the open bitstream "wvbits" and,
-// if the bitstream "wvcbits" is open, writes any correction data there. This
-// function will work for either lossless or hybrid but because a version
-// optimized for lossless exits below, it would normally be used for the hybrid
-// mode only. The return value is the actual value stored to the stream (even
-// if a correction file is being created) and is used as feedback to the
-// predictor.
-
-int32_t FASTCALL send_word (WavpackStream *wps, int32_t value, int chan)
-{
-    struct entropy_data *c = wps->w.c + chan;
-    uint32_t ones_count, low, mid, high;
-    int sign = (value < 0) ? 1 : 0;
-
-    if (wps->w.c [0].median [0] < 2 && !wps->w.holding_zero && wps->w.c [1].median [0] < 2) {
-        if (wps->w.zeros_acc) {
-            if (value)
-                flush_word (wps);
-            else {
-                c->slow_level -= (c->slow_level + SLO) >> SLS;
-                wps->w.zeros_acc++;
-                return 0;
-            }
-        }
-        else if (value)
-            putbit_0 (&wps->wvbits);
-        else {
-            c->slow_level -= (c->slow_level + SLO) >> SLS;
-            CLEAR (wps->w.c [0].median);
-            CLEAR (wps->w.c [1].median);
-            wps->w.zeros_acc = 1;
-            return 0;
-        }
-    }
-
-    if (sign)
-        value = ~value;
-
-    if ((wps->wphdr.flags & HYBRID_FLAG) && !chan)
-        update_error_limit (wps);
-
-    if (value < (int32_t) GET_MED (0)) {
-        ones_count = low = 0;
-        high = GET_MED (0) - 1;
-        DEC_MED0 ();
-    }
-    else {
-        low = GET_MED (0);
-        INC_MED0 ();
-
-        if (value - low < GET_MED (1)) {
-            ones_count = 1;
-            high = low + GET_MED (1) - 1;
-            DEC_MED1 ();
-        }
-        else {
-            low += GET_MED (1);
-            INC_MED1 ();
-
-            if (value - low < GET_MED (2)) {
-                ones_count = 2;
-                high = low + GET_MED (2) - 1;
-                DEC_MED2 ();
-            }
-            else {
-                ones_count = 2 + (value - low) / GET_MED (2);
-                low += (ones_count - 2) * GET_MED (2);
-                high = low + GET_MED (2) - 1;
-                INC_MED2 ();
-            }
-        }
-    }
-
-    mid = (high + low + 1) >> 1;
-
-    if (wps->w.holding_zero) {
-        if (ones_count)
-            wps->w.holding_one++;
-
-        flush_word (wps);
-
-        if (ones_count) {
-            wps->w.holding_zero = 1;
-            ones_count--;
-        }
-        else
-            wps->w.holding_zero = 0;
-    }
-    else
-        wps->w.holding_zero = 1;
-
-    wps->w.holding_one = ones_count * 2;
-
-    if (!c->error_limit) {
-        if (high != low) {
-            uint32_t maxcode = high - low, code = value - low;
-            int bitcount = count_bits (maxcode);
-            uint32_t extras = bitset [bitcount] - maxcode - 1;
-
-            if (code < extras) {
-                wps->w.pend_data |= code << wps->w.pend_count;
-                wps->w.pend_count += bitcount - 1;
-            }
-            else {
-                wps->w.pend_data |= ((code + extras) >> 1) << wps->w.pend_count;
-                wps->w.pend_count += bitcount - 1;
-                wps->w.pend_data |= ((code + extras) & 1) << wps->w.pend_count++;
-            }
-        }
-
-        mid = value;
-    }
-    else
-        while (high - low > c->error_limit)
-            if (value < (int32_t) mid) {
-                mid = ((high = mid - 1) + low + 1) >> 1;
-                wps->w.pend_count++;
-            }
-            else {
-                mid = (high + (low = mid) + 1) >> 1;
-                wps->w.pend_data |= bitset [wps->w.pend_count++];
-            }
-
-    wps->w.pend_data |= ((int32_t) sign << wps->w.pend_count++);
-
-    if (!wps->w.holding_zero)
-        flush_word (wps);
-
-    if (bs_is_open (&wps->wvcbits) && c->error_limit) {
-        uint32_t code = value - low, maxcode = high - low;
-        int bitcount = count_bits (maxcode);
-        uint32_t extras = bitset [bitcount] - maxcode - 1;
-
-        if (bitcount) {
-            if (code < extras)
-                putbits (code, bitcount - 1, &wps->wvcbits);
-            else {
-                putbits ((code + extras) >> 1, bitcount - 1, &wps->wvcbits);
-                putbit ((code + extras) & 1, &wps->wvcbits);
-            }
-        }
-    }
-
-    if (wps->wphdr.flags & HYBRID_BITRATE) {
-        c->slow_level -= (c->slow_level + SLO) >> SLS;
-        c->slow_level += mylog2 (mid);
-    }
-
-    return sign ? ~mid : mid;
-}
-
-// This function is an optimized version of send_word() that only handles
-// lossless (error_limit == 0) and sends an entire buffer of either mono or
-// stereo data rather than a single sample. Unlike the generalized
-// send_word(), it does not return values because it always encodes
-// the exact value passed.
-
-void send_words_lossless (WavpackStream *wps, int32_t *buffer, int32_t nsamples)
-{
-    struct entropy_data *c = wps->w.c;
-    int32_t value, csamples;
-
-    if (!(wps->wphdr.flags & MONO_DATA))
-        nsamples *= 2;
-
-    for (csamples = 0; csamples < nsamples; ++csamples) {
-        int sign = ((value = *buffer++) < 0) ? 1 : 0;
-        uint32_t ones_count, low, high;
-
-        if (!(wps->wphdr.flags & MONO_DATA))
-            c = wps->w.c + (csamples & 1);
-
-        if (wps->w.c [0].median [0] < 2 && !wps->w.holding_zero && wps->w.c [1].median [0] < 2) {
-            if (wps->w.zeros_acc) {
-                if (value)
-                    flush_word (wps);
-                else {
-                    wps->w.zeros_acc++;
-                    continue;
-                }
-            }
-            else if (value)
-                putbit_0 (&wps->wvbits);
-            else {
-                CLEAR (wps->w.c [0].median);
-                CLEAR (wps->w.c [1].median);
-                wps->w.zeros_acc = 1;
-                continue;
-            }
-        }
-
-        if (sign)
-            value = ~value;
-
-        if (value < (int32_t) GET_MED (0)) {
-            ones_count = low = 0;
-            high = GET_MED (0) - 1;
-            DEC_MED0 ();
-        }
-        else {
-            low = GET_MED (0);
-            INC_MED0 ();
-
-            if (value - low < GET_MED (1)) {
-                ones_count = 1;
-                high = low + GET_MED (1) - 1;
-                DEC_MED1 ();
-            }
-            else {
-                low += GET_MED (1);
-                INC_MED1 ();
-
-                if (value - low < GET_MED (2)) {
-                    ones_count = 2;
-                    high = low + GET_MED (2) - 1;
-                    DEC_MED2 ();
-                }
-                else {
-                    ones_count = 2 + (value - low) / GET_MED (2);
-                    low += (ones_count - 2) * GET_MED (2);
-                    high = low + GET_MED (2) - 1;
-                    INC_MED2 ();
-                }
-            }
-        }
-
-        if (wps->w.holding_zero) {
-            if (ones_count)
-                wps->w.holding_one++;
-
-            flush_word (wps);
-
-            if (ones_count) {
-                wps->w.holding_zero = 1;
-                ones_count--;
-            }
-            else
-                wps->w.holding_zero = 0;
-        }
-        else
-            wps->w.holding_zero = 1;
-
-        wps->w.holding_one = ones_count * 2;
-
-        if (high != low) {
-            uint32_t maxcode = high - low, code = value - low;
-            int bitcount = count_bits (maxcode);
-            uint32_t extras = bitset [bitcount] - maxcode - 1;
-
-            if (code < extras) {
-                wps->w.pend_data |= code << wps->w.pend_count;
-                wps->w.pend_count += bitcount - 1;
-            }
-            else {
-                wps->w.pend_data |= ((code + extras) >> 1) << wps->w.pend_count;
-                wps->w.pend_count += bitcount - 1;
-                wps->w.pend_data |= ((code + extras) & 1) << wps->w.pend_count++;
-            }
-        }
-
-        wps->w.pend_data |= ((int32_t) sign << wps->w.pend_count++);
-
-        if (!wps->w.holding_zero)
-            flush_word (wps);
-    }
-}
-
-// Used by send_word() and send_word_lossless() to actually send most the
-// accumulated data onto the bitstream. This is also called directly from
-// clients when all words have been sent.
-
-void flush_word (WavpackStream *wps)
-{
-    if (wps->w.zeros_acc) {
-        int cbits = count_bits (wps->w.zeros_acc);
-
-        while (cbits--)
-            putbit_1 (&wps->wvbits);
-
-        putbit_0 (&wps->wvbits);
-
-        while (wps->w.zeros_acc > 1) {
-            putbit (wps->w.zeros_acc & 1, &wps->wvbits);
-            wps->w.zeros_acc >>= 1;
-        }
-
-        wps->w.zeros_acc = 0;
-    }
-
-    if (wps->w.holding_one) {
-#ifdef LIMIT_ONES
-        if (wps->w.holding_one >= LIMIT_ONES) {
-            int cbits;
-
-            putbits ((1L << LIMIT_ONES) - 1, LIMIT_ONES + 1, &wps->wvbits);
-            wps->w.holding_one -= LIMIT_ONES;
-            cbits = count_bits (wps->w.holding_one);
-
-            while (cbits--)
-                putbit_1 (&wps->wvbits);
-
-            putbit_0 (&wps->wvbits);
-
-            while (wps->w.holding_one > 1) {
-                putbit (wps->w.holding_one & 1, &wps->wvbits);
-                wps->w.holding_one >>= 1;
-            }
-
-            wps->w.holding_zero = 0;
-        }
-        else
-            putbits (bitmask [wps->w.holding_one], wps->w.holding_one, &wps->wvbits);
-
-        wps->w.holding_one = 0;
-#else
-        do {
-            putbit_1 (&wps->wvbits);
-        } while (--wps->w.holding_one);
-#endif
-    }
-
-    if (wps->w.holding_zero) {
-        putbit_0 (&wps->wvbits);
-        wps->w.holding_zero = 0;
-    }
-
-    if (wps->w.pend_count) {
-        putbits (wps->w.pend_data, wps->w.pend_count, &wps->wvbits);
-        wps->w.pend_data = wps->w.pend_count = 0;
-    }
-}
-
-// This function is similar to send_word() except that no data is actually
-// written to any stream, but it does return the value that would have been
-// sent to a hybrid stream. It is used to determine beforehand how much noise
-// will be added to samples.
-
-int32_t nosend_word (WavpackStream *wps, int32_t value, int chan)
-{
-    struct entropy_data *c = wps->w.c + chan;
-    uint32_t ones_count, low, mid, high;
-    int sign = (value < 0) ? 1 : 0;
-
-    if (sign)
-        value = ~value;
-
-    if ((wps->wphdr.flags & HYBRID_FLAG) && !chan)
-        update_error_limit (wps);
-
-    if (value < (int32_t) GET_MED (0)) {
-        low = 0;
-        high = GET_MED (0) - 1;
-        DEC_MED0 ();
-    }
-    else {
-        low = GET_MED (0);
-        INC_MED0 ();
-
-        if (value - low < GET_MED (1)) {
-            high = low + GET_MED (1) - 1;
-            DEC_MED1 ();
-        }
-        else {
-            low += GET_MED (1);
-            INC_MED1 ();
-
-            if (value - low < GET_MED (2)) {
-                high = low + GET_MED (2) - 1;
-                DEC_MED2 ();
-            }
-            else {
-                ones_count = 2 + (value - low) / GET_MED (2);
-                low += (ones_count - 2) * GET_MED (2);
-                high = low + GET_MED (2) - 1;
-                INC_MED2 ();
-            }
-        }
-    }
-
-    mid = (high + low + 1) >> 1;
-
-    if (!c->error_limit)
-        mid = value;
-    else
-        while (high - low > c->error_limit)
-            if (value < (int32_t) mid)
-                mid = ((high = mid - 1) + low + 1) >> 1;
-            else
-                mid = (high + (low = mid) + 1) >> 1;
-
-    c->slow_level -= (c->slow_level + SLO) >> SLS;
-    c->slow_level += mylog2 (mid);
-
-    return sign ? ~mid : mid;
-}
-
-// This function is used to scan some number of samples to set the variables
-// "slow_level" and the "median" array. In pure symetrical encoding mode this
-// would not be needed because these values would simply be continued from the
-// previous block. However, in the -X modes and the 32-bit modes we cannot do
-// this because parameters may change between blocks and the variables might
-// not apply. This function can work in mono or stereo and can scan a block
-// in either direction.
-
-void scan_word (WavpackStream *wps, int32_t *samples, uint32_t num_samples, int dir)
-{
-    uint32_t flags = wps->wphdr.flags, value, low;
-    struct entropy_data *c = wps->w.c;
-    int chan;
-
-    init_words (wps);
-
-    if (flags & MONO_DATA) {
-        if (dir < 0) {
-            samples += (num_samples - 1);
-            dir = -1;
-        }
-        else
-            dir = 1;
-    }
-    else {
-        if (dir < 0) {
-            samples += (num_samples - 1) * 2;
-            dir = -2;
-        }
-        else
-            dir = 2;
-    }
-
-    while (num_samples--) {
-
-        value = labs (samples [chan = 0]);
-
-        if (flags & HYBRID_BITRATE) {
-            wps->w.c [0].slow_level -= (wps->w.c [0].slow_level + SLO) >> SLS;
-            wps->w.c [0].slow_level += mylog2 (value);
-        }
-
-        if (value < GET_MED (0)) {
-            DEC_MED0 ();
-        }
-        else {
-            low = GET_MED (0);
-            INC_MED0 ();
-
-            if (value - low < GET_MED (1)) {
-                DEC_MED1 ();
-            }
-            else {
-                low += GET_MED (1);
-                INC_MED1 ();
-
-                if (value - low < GET_MED (2)) {
-                    DEC_MED2 ();
-                }
-                else {
-                    INC_MED2 ();
-                }
-            }
-        }
-
-        if (!(flags & MONO_DATA)) {
-            value = labs (samples [chan = 1]);
-            c++;
-
-            if (wps->wphdr.flags & HYBRID_BITRATE) {
-                wps->w.c [1].slow_level -= (wps->w.c [1].slow_level + SLO) >> SLS;
-                wps->w.c [1].slow_level += mylog2 (value);
-            }
-
-            if (value < GET_MED (0)) {
-                DEC_MED0 ();
-            }
-            else {
-                low = GET_MED (0);
-                INC_MED0 ();
-
-                if (value - low < GET_MED (1)) {
-                    DEC_MED1 ();
-                }
-                else {
-                    low += GET_MED (1);
-                    INC_MED1 ();
-
-                    if (value - low < GET_MED (2)) {
-                        DEC_MED2 ();
-                    }
-                    else {
-                        INC_MED2 ();
-                    }
-                }
-            }
-
-            c--;
-        }
-
-        samples += dir;
-    }
-}
-
-#endif
-
-#ifndef NO_UNPACK
-
-static uint32_t FASTCALL read_code (Bitstream *bs, uint32_t maxcode);
-
-// Read the next word from the bitstream "wvbits" and return the value. This
-// function can be used for hybrid or lossless streams, but since an
-// optimized version is available for lossless this function would normally
-// be used for hybrid only. If a hybrid lossless stream is being read then
-// the "correction" offset is written at the specified pointer. A return value
-// of WORD_EOF indicates that the end of the bitstream was reached (all 1s) or
-// some other error occurred.
-
-int32_t FASTCALL get_word (WavpackStream *wps, int chan, int32_t *correction)
-{
-    register struct entropy_data *c = wps->w.c + chan;
-    uint32_t ones_count, low, mid, high;
-    int next8, sign;
-    int32_t value;
-
-    if (correction)
-        *correction = 0;
-
-    if (!(wps->w.c [0].median [0] & ~1) && !wps->w.holding_zero && !wps->w.holding_one && !(wps->w.c [1].median [0] & ~1)) {
-        uint32_t mask;
-        int cbits;
-
-        if (wps->w.zeros_acc) {
-            if (--wps->w.zeros_acc) {
-                c->slow_level -= (c->slow_level + SLO) >> SLS;
-                return 0;
-            }
-        }
-        else {
-            for (cbits = 0; cbits < 33 && getbit (&wps->wvbits); ++cbits);
-
-            if (cbits == 33)
-                return WORD_EOF;
-
-            if (cbits < 2)
-                wps->w.zeros_acc = cbits;
-            else {
-                for (mask = 1, wps->w.zeros_acc = 0; --cbits; mask <<= 1)
-                    if (getbit (&wps->wvbits))
-                        wps->w.zeros_acc |= mask;
-
-                wps->w.zeros_acc |= mask;
-            }
-
-            if (wps->w.zeros_acc) {
-                c->slow_level -= (c->slow_level + SLO) >> SLS;
-                CLEAR (wps->w.c [0].median);
-                CLEAR (wps->w.c [1].median);
-                return 0;
-            }
-        }
-    }
-
-    if (wps->w.holding_zero)
-        ones_count = wps->w.holding_zero = 0;
-    else {
-#ifdef USE_NEXT8_OPTIMIZATION
-        if (wps->wvbits.bc < 8) {
-            if (++(wps->wvbits.ptr) == wps->wvbits.end)
-                wps->wvbits.wrap (&wps->wvbits);
-
-            next8 = (wps->wvbits.sr |= *(wps->wvbits.ptr) << wps->wvbits.bc) & 0xff;
-            wps->wvbits.bc += sizeof (*(wps->wvbits.ptr)) * 8;
-        }
-        else
-            next8 = wps->wvbits.sr & 0xff;
-
-        if (next8 == 0xff) {
-            wps->wvbits.bc -= 8;
-            wps->wvbits.sr >>= 8;
-
-            for (ones_count = 8; ones_count < (LIMIT_ONES + 1) && getbit (&wps->wvbits); ++ones_count);
-
-            if (ones_count == (LIMIT_ONES + 1))
-                return WORD_EOF;
-
-            if (ones_count == LIMIT_ONES) {
-                uint32_t mask;
-                int cbits;
-
-                for (cbits = 0; cbits < 33 && getbit (&wps->wvbits); ++cbits);
-
-                if (cbits == 33)
-                    return WORD_EOF;
-
-                if (cbits < 2)
-                    ones_count = cbits;
-                else {
-                    for (mask = 1, ones_count = 0; --cbits; mask <<= 1)
-                        if (getbit (&wps->wvbits))
-                            ones_count |= mask;
-
-                    ones_count |= mask;
-                }
-
-                ones_count += LIMIT_ONES;
-            }
-        }
-        else {
-            wps->wvbits.bc -= (ones_count = ones_count_table [next8]) + 1;
-            wps->wvbits.sr >>= ones_count + 1;
-        }
-#else
-        for (ones_count = 0; ones_count < (LIMIT_ONES + 1) && getbit (&wps->wvbits); ++ones_count);
-
-        if (ones_count >= LIMIT_ONES) {
-            uint32_t mask;
-            int cbits;
-
-            if (ones_count == (LIMIT_ONES + 1))
-                return WORD_EOF;
-
-            for (cbits = 0; cbits < 33 && getbit (&wps->wvbits); ++cbits);
-
-            if (cbits == 33)
-                return WORD_EOF;
-
-            if (cbits < 2)
-                ones_count = cbits;
-            else {
-                for (mask = 1, ones_count = 0; --cbits; mask <<= 1)
-                    if (getbit (&wps->wvbits))
-                        ones_count |= mask;
-
-                ones_count |= mask;
-            }
-
-            ones_count += LIMIT_ONES;
-        }
-#endif
-
-        if (wps->w.holding_one) {
-            wps->w.holding_one = ones_count & 1;
-            ones_count = (ones_count >> 1) + 1;
-        }
-        else {
-            wps->w.holding_one = ones_count & 1;
-            ones_count >>= 1;
-        }
-
-        wps->w.holding_zero = ~wps->w.holding_one & 1;
-    }
-
-    if ((wps->wphdr.flags & HYBRID_FLAG) && !chan)
-        update_error_limit (wps);
-
-    if (ones_count == 0) {
-        low = 0;
-        high = GET_MED (0) - 1;
-        DEC_MED0 ();
-    }
-    else {
-        low = GET_MED (0);
-        INC_MED0 ();
-
-        if (ones_count == 1) {
-            high = low + GET_MED (1) - 1;
-            DEC_MED1 ();
-        }
-        else {
-            low += GET_MED (1);
-            INC_MED1 ();
-
-            if (ones_count == 2) {
-                high = low + GET_MED (2) - 1;
-                DEC_MED2 ();
-            }
-            else {
-                low += (ones_count - 2) * GET_MED (2);
-                high = low + GET_MED (2) - 1;
-                INC_MED2 ();
-            }
-        }
-    }
-
-    low &= 0x7fffffff;
-    high &= 0x7fffffff;
-    mid = (high + low + 1) >> 1;
-
-    if (!c->error_limit)
-        mid = read_code (&wps->wvbits, high - low) + low;
-    else while (high - low > c->error_limit) {
-        if (getbit (&wps->wvbits))
-            mid = (high + (low = mid) + 1) >> 1;
-        else
-            mid = ((high = mid - 1) + low + 1) >> 1;
-    }
-
-    sign = getbit (&wps->wvbits);
-
-    if (bs_is_open (&wps->wvcbits) && c->error_limit) {
-        value = read_code (&wps->wvcbits, high - low) + low;
-
-        if (correction)
-            *correction = sign ? (mid - value) : (value - mid);
-    }
-
-    if (wps->wphdr.flags & HYBRID_BITRATE) {
-        c->slow_level -= (c->slow_level + SLO) >> SLS;
-        c->slow_level += mylog2 (mid);
-    }
-
-    return sign ? ~mid : mid;
-}
-
-// This is an optimized version of get_word() that is used for lossless only
-// (error_limit == 0). Also, rather than obtaining a single sample, it can be
-// used to obtain an entire buffer of either mono or stereo samples.
-
-int32_t get_words_lossless (WavpackStream *wps, int32_t *buffer, int32_t nsamples)
-{
-    struct entropy_data *c = wps->w.c;
-    uint32_t ones_count, low, high;
-    Bitstream *bs = &wps->wvbits;
-    int32_t csamples;
-
-    if (!(wps->wphdr.flags & MONO_DATA))
-        nsamples *= 2;
-
-    for (csamples = 0; csamples < nsamples; ++csamples) {
-        if (!(wps->wphdr.flags & MONO_DATA))
-            c = wps->w.c + (csamples & 1);
-
-        if (wps->w.c [0].median [0] < 2 && !wps->w.holding_zero && !wps->w.holding_one && wps->w.c [1].median [0] < 2) {
-            uint32_t mask;
-            int cbits;
-
-            if (wps->w.zeros_acc) {
-                if (--wps->w.zeros_acc) {
-                    *buffer++ = 0;
-                    continue;
-                }
-            }
-            else {
-                for (cbits = 0; cbits < 33 && getbit (bs); ++cbits);
-
-                if (cbits == 33)
-                    break;
-
-                if (cbits < 2)
-                    wps->w.zeros_acc = cbits;
-                else {
-                    for (mask = 1, wps->w.zeros_acc = 0; --cbits; mask <<= 1)
-                        if (getbit (bs))
-                            wps->w.zeros_acc |= mask;
-
-                    wps->w.zeros_acc |= mask;
-                }
-
-                if (wps->w.zeros_acc) {
-                    CLEAR (wps->w.c [0].median);
-                    CLEAR (wps->w.c [1].median);
-                    *buffer++ = 0;
-                    continue;
-                }
-            }
-        }
-
-        if (wps->w.holding_zero)
-            ones_count = wps->w.holding_zero = 0;
-        else {
-#ifdef USE_NEXT8_OPTIMIZATION
-            int next8;
-
-            if (bs->bc < 8) {
-                if (++(bs->ptr) == bs->end)
-                    bs->wrap (bs);
-
-                next8 = (bs->sr |= *(bs->ptr) << bs->bc) & 0xff;
-                bs->bc += sizeof (*(bs->ptr)) * 8;
-            }
-            else
-                next8 = bs->sr & 0xff;
-
-            if (next8 == 0xff) {
-                bs->bc -= 8;
-                bs->sr >>= 8;
-
-                for (ones_count = 8; ones_count < (LIMIT_ONES + 1) && getbit (bs); ++ones_count);
-
-                if (ones_count == (LIMIT_ONES + 1))
-                    break;
-
-                if (ones_count == LIMIT_ONES) {
-                    uint32_t mask;
-                    int cbits;
-
-                    for (cbits = 0; cbits < 33 && getbit (bs); ++cbits);
-
-                    if (cbits == 33)
-                        break;
-
-                    if (cbits < 2)
-                        ones_count = cbits;
-                    else {
-                        for (mask = 1, ones_count = 0; --cbits; mask <<= 1)
-                            if (getbit (bs))
-                                ones_count |= mask;
-
-                        ones_count |= mask;
-                    }
-
-                    ones_count += LIMIT_ONES;
-                }
-            }
-            else {
-                bs->bc -= (ones_count = ones_count_table [next8]) + 1;
-                bs->sr >>= ones_count + 1;
-            }
-#else
-            for (ones_count = 0; ones_count < (LIMIT_ONES + 1) && getbit (bs); ++ones_count);
-
-            if (ones_count >= LIMIT_ONES) {
-                uint32_t mask;
-                int cbits;
-
-                if (ones_count == (LIMIT_ONES + 1))
-                    break;
-
-                for (cbits = 0; cbits < 33 && getbit (bs); ++cbits);
-
-                if (cbits == 33)
-                    break;
-
-                if (cbits < 2)
-                    ones_count = cbits;
-                else {
-                    for (mask = 1, ones_count = 0; --cbits; mask <<= 1)
-                        if (getbit (bs))
-                            ones_count |= mask;
-
-                    ones_count |= mask;
-                }
-
-                ones_count += LIMIT_ONES;
-            }
-#endif
-            if (wps->w.holding_one) {
-                wps->w.holding_one = ones_count & 1;
-                ones_count = (ones_count >> 1) + 1;
-            }
-            else {
-                wps->w.holding_one = ones_count & 1;
-                ones_count >>= 1;
-            }
-
-            wps->w.holding_zero = ~wps->w.holding_one & 1;
-        }
-
-        if (ones_count == 0) {
-            low = 0;
-            high = GET_MED (0) - 1;
-            DEC_MED0 ();
-        }
-        else {
-            low = GET_MED (0);
-            INC_MED0 ();
-
-            if (ones_count == 1) {
-                high = low + GET_MED (1) - 1;
-                DEC_MED1 ();
-            }
-            else {
-                low += GET_MED (1);
-                INC_MED1 ();
-
-                if (ones_count == 2) {
-                    high = low + GET_MED (2) - 1;
-                    DEC_MED2 ();
-                }
-                else {
-                    low += (ones_count - 2) * GET_MED (2);
-                    high = low + GET_MED (2) - 1;
-                    INC_MED2 ();
-                }
-            }
-        }
-
-        low += read_code (bs, high - low);
-        *buffer++ = (getbit (bs)) ? ~low : low;
-    }
-
-    return (wps->wphdr.flags & MONO_DATA) ? csamples : (csamples / 2);
-}
-
-// Read a single unsigned value from the specified bitstream with a value
-// from 0 to maxcode. If there are exactly a power of two number of possible
-// codes then this will read a fixed number of bits; otherwise it reads the
-// minimum number of bits and then determines whether another bit is needed
-// to define the code.
-
-static uint32_t FASTCALL read_code (Bitstream *bs, uint32_t maxcode)
-{
-    uint32_t extras, code;
-    int bitcount;
-
-    if (maxcode < 2)
-        return maxcode ? getbit (bs) : 0;
-
-    bitcount = count_bits (maxcode);
-    extras = bitset [bitcount] - maxcode - 1;
-
-    while (bs->bc < bitcount) {
-        if (++(bs->ptr) == bs->end)
-            bs->wrap (bs);
-
-        bs->sr |= *(bs->ptr) << bs->bc;
-        bs->bc += sizeof (*(bs->ptr)) * 8;
-    }
-
-    if ((code = bs->sr & bitmask [bitcount - 1]) >= extras)
-        code = (code << 1) - extras + ((bs->sr >> (bitcount - 1)) & 1);
-    else
-        bitcount--;
-
-    if (bs->bc > 32) {
-        bs->bc -= bitcount;
-        bs->sr = *(bs->ptr) >> (sizeof (*(bs->ptr)) * 8 - bs->bc);
-    }
-    else {
-        bs->sr >>= bitcount;
-        bs->bc -= bitcount;
-    }
-
-    return code;
-}
-
-#endif
-
-// The concept of a base 2 logarithm is used in many parts of WavPack. It is
-// a way of sufficiently accurately representing 32-bit signed and unsigned
-// values storing only 16 bits (actually fewer). It is also used in the hybrid
-// mode for quickly comparing the relative magnitude of large values (i.e.
-// division) and providing smooth exponentials using only addition.
-
-// These are not strict logarithms in that they become linear around zero and
-// can therefore represent both zero and negative values. They have 8 bits
-// of precision and in "roundtrip" conversions the total error never exceeds 1
-// part in 225 except for the cases of +/-115 and +/-195 (which error by 1).
-
-
-// This function returns the log2 for the specified 32-bit unsigned value.
-// The maximum value allowed is about 0xff800000 and returns 8447.
-
-static int FASTCALL mylog2 (uint32_t avalue)
-{
-    int dbits;
-
-    if ((avalue += avalue >> 9) < (1 << 8)) {
-        dbits = nbits_table [avalue];
-        return (dbits << 8) + log2_table [(avalue << (9 - dbits)) & 0xff];
-    }
-    else {
-        if (avalue < (1L << 16))
-            dbits = nbits_table [avalue >> 8] + 8;
-        else if (avalue < (1L << 24))
-            dbits = nbits_table [avalue >> 16] + 16;
-        else
-            dbits = nbits_table [avalue >> 24] + 24;
-
-        return (dbits << 8) + log2_table [(avalue >> (dbits - 9)) & 0xff];
-    }
-}
-
-// This function scans a buffer of longs and accumulates the total log2 value
-// of all the samples. This is useful for determining maximum compression
-// because the bitstream storage required for entropy coding is proportional
-// to the base 2 log of the samples.
-
-uint32_t log2buffer (int32_t *samples, uint32_t num_samples, int limit)
-{
-    uint32_t result = 0, avalue;
-    int dbits;
-
-    while (num_samples--) {
-        avalue = abs (*samples++);
-
-        if ((avalue += avalue >> 9) < (1 << 8)) {
-            dbits = nbits_table [avalue];
-            result += (dbits << 8) + log2_table [(avalue << (9 - dbits)) & 0xff];
-        }
-        else {
-            if (avalue < (1L << 16))
-                dbits = nbits_table [avalue >> 8] + 8;
-            else if (avalue < (1L << 24))
-                dbits = nbits_table [avalue >> 16] + 16;
-            else
-                dbits = nbits_table [avalue >> 24] + 24;
-
-            result += dbits = (dbits << 8) + log2_table [(avalue >> (dbits - 9)) & 0xff];
-
-            if (limit && dbits >= limit)
-                return (uint32_t) -1;
-        }
-    }
-
-    return result;
-}
-
-// This function returns the log2 for the specified 32-bit signed value.
-// All input values are valid and the return values are in the range of
-// +/- 8192.
-
-int log2s (int32_t value)
-{
-    return (value < 0) ? -mylog2 (-value) : mylog2 (value);
-}
-
-// This function returns the original integer represented by the supplied
-// logarithm (at least within the provided accuracy). The log is signed,
-// but since a full 32-bit value is returned this can be used for unsigned
-// conversions as well (i.e. the input range is -8192 to +8447).
-
-int32_t exp2s (int log)
-{
-    uint32_t value;
-
-    if (log < 0)
-        return -exp2s (-log);
-
-    value = exp2_table [log & 0xff] | 0x100;
-
-    if ((log >>= 8) <= 9)
-        return value >> (9 - log);
-    else
-        return value << (log - 9);
-}
-
-// These two functions convert internal weights (which are normally +/-1024)
-// to and from an 8-bit signed character version for storage in metadata. The
-// weights are clipped here in the case that they are outside that range.
-
-signed char store_weight (int weight)
-{
-    if (weight > 1024)
-        weight = 1024;
-    else if (weight < -1024)
-        weight = -1024;
-
-    if (weight > 0)
-        weight -= (weight + 64) >> 7;
-
-    return (weight + 4) >> 3;
-}
-
-int restore_weight (signed char weight)
-{
-    int result;
-
-    if ((result = (int) weight << 3) > 0)
-        result += (result + 64) >> 7;
-
-    return result;
-}
diff --git a/third_party/wavpack/src/wputils.c b/third_party/wavpack/src/wputils.c
deleted file mode 100644
index 5d30f1e..0000000
--- a/third_party/wavpack/src/wputils.c
+++ /dev/null
@@ -1,2350 +0,0 @@
-////////////////////////////////////////////////////////////////////////////
-//                           **** WAVPACK ****                            //
-//                  Hybrid Lossless Wavefile Compressor                   //
-//              Copyright (c) 1998 - 2006 Conifer Software.               //
-//                          All Rights Reserved.                          //
-//      Distributed under the BSD Software License (see license.txt)      //
-////////////////////////////////////////////////////////////////////////////
-
-// wputils.c
-
-// This module provides a high-level interface to reading and writing WavPack
-// files. WavPack input files can be opened as standard "C" streams using a
-// provided filename. However, an alternate entry uses stream-reading
-// callbacks to make using another file I/O method easy. Note that in this
-// case the user application is responsible for finding and opening the .wvc
-// file if the use of them is desired.
-
-// For writing WavPack files there are no I/O routines used; a callback for
-// writing completed blocks is provided.
-
-#include <stdlib.h>
-#include <fcntl.h>
-#include <string.h>
-#include <math.h>
-#include <sys/stat.h>
-
-#if defined (WIN32) || defined (__OS2__)
-#include <io.h>
-#endif
-
-#ifndef LIBWAVPACK_VERSION_STRING
-#include "wavpack_version.h"
-#endif
-
-#include "wavpack_local.h"
-
-#ifdef WIN32
-#define stricmp(x,y) _stricmp(x,y)
-#define fileno _fileno
-#else
-#define stricmp strcasecmp
-#endif
-
-#ifdef DEBUG_ALLOC
-#define malloc malloc_db
-#define realloc realloc_db
-#define free free_db
-void *malloc_db (uint32_t size);
-void *realloc_db (void *ptr, uint32_t size);
-void free_db (void *ptr);
-int32_t dump_alloc (void);
-#endif
-
-static void free_streams (WavpackContext *wpc);
-
-///////////////////////////// local table storage ////////////////////////////
-
-static const uint32_t sample_rates [] = { 6000, 8000, 9600, 11025, 12000, 16000, 22050,
-    24000, 32000, 44100, 48000, 64000, 88200, 96000, 192000 };
-
-///////////////////////////// executable code ////////////////////////////////
-
-#if !defined(NO_UNPACK) || defined(INFO_ONLY)
-
-static uint32_t read_next_header (WavpackStreamReader *reader, void *id, WavpackHeader *wphdr);
-static uint32_t seek_final_index (WavpackStreamReader *reader, void *id);
-static int read_wvc_block (WavpackContext *wpc);
-
-// This code provides an interface between the reader callback mechanism that
-// WavPack uses internally and the standard fstream C library.
-
-#ifndef NO_USE_FSTREAMS
-
-static int32_t read_bytes (void *id, void *data, int32_t bcount)
-{
-    return (int32_t) fread (data, 1, bcount, (FILE*) id);
-}
-
-static uint32_t get_pos (void *id)
-{
-    return ftell ((FILE*) id);
-}
-
-static int set_pos_abs (void *id, uint32_t pos)
-{
-    return fseek (id, pos, SEEK_SET);
-}
-
-static int set_pos_rel (void *id, int32_t delta, int mode)
-{
-    return fseek (id, delta, mode);
-}
-
-static int push_back_byte (void *id, int c)
-{
-    return ungetc (c, id);
-}
-
-static uint32_t get_length (void *id)
-{
-    FILE *file = id;
-    struct stat statbuf;
-
-    if (!file || fstat (fileno (file), &statbuf) || !(statbuf.st_mode & S_IFREG))
-        return 0;
-
-    return statbuf.st_size;
-}
-
-static int can_seek (void *id)
-{
-    FILE *file = id;
-    struct stat statbuf;
-
-    return file && !fstat (fileno (file), &statbuf) && (statbuf.st_mode & S_IFREG);
-}
-
-static int32_t write_bytes (void *id, void *data, int32_t bcount)
-{
-    return (int32_t) fwrite (data, 1, bcount, (FILE*) id);
-}
-
-static WavpackStreamReader freader = {
-    read_bytes, get_pos, set_pos_abs, set_pos_rel, push_back_byte, get_length, can_seek,
-    write_bytes
-};
-
-// This function attempts to open the specified WavPack file for reading. If
-// this fails for any reason then an appropriate message is copied to "error"
-// (which must accept 80 characters) and NULL is returned, otherwise a
-// pointer to a WavpackContext structure is returned (which is used to call
-// all other functions in this module). A filename beginning with "-" is
-// assumed to be stdin. The "flags" argument has the following bit mask
-// values to specify details of the open operation:
-
-// OPEN_WVC:  attempt to open/read "correction" file
-// OPEN_TAGS:  attempt to read ID3v1 / APEv2 tags (requires seekable file)
-// OPEN_WRAPPER:  make audio wrapper available (i.e. RIFF) to caller
-// OPEN_2CH_MAX:  open only first stream of multichannel file (usually L/R)
-// OPEN_NORMALIZE:  normalize floating point data to +/- 1.0 (w/ offset exp)
-// OPEN_STREAMING:  blindly unpacks blocks w/o regard to header file position
-// OPEN_EDIT_TAGS:  allow editing of tags (file must be writable)
-
-// Version 4.2 of the WavPack library adds the OPEN_STREAMING flag. This is
-// essentially a "raw" mode where the library will simply decode any blocks
-// fed it through the reader callback, regardless of where those blocks came
-// from in a stream. The only requirement is that complete WavPack blocks are
-// fed to the decoder (and this may require multiple blocks in multichannel
-// mode) and that complete blocks are decoded (even if all samples are not
-// actually required). All the blocks must contain the same number of channels
-// and bit resolution, and the correction data must be either present or not.
-// All other parameters may change from block to block (like lossy/lossless).
-// Obviously, in this mode any seeking must be performed by the application
-// (and again, decoding must start at the beginning of the block containing
-// the seek sample).
-
-WavpackContext *WavpackOpenFileInput (const char *infilename, char *error, int flags, int norm_offset)
-{
-    char *file_mode = (flags & OPEN_EDIT_TAGS) ? "r+b" : "rb";
-    FILE *wv_id, *wvc_id;
-    WavpackContext *wpc;
-
-    if (*infilename == '-') {
-        wv_id = stdin;
-#if defined(WIN32)
-        _setmode (fileno (stdin), O_BINARY);
-#endif
-#if defined(__OS2__)
-        setmode (fileno (stdin), O_BINARY);
-#endif
-    }
-    else if ((wv_id = fopen (infilename, file_mode)) == NULL) {
-        if (error) strcpy (error, (flags & OPEN_EDIT_TAGS) ? "can't open file for editing" : "can't open file");
-        return NULL;
-    }
-
-    if (wv_id != stdin && (flags & OPEN_WVC)) {
-        char *in2filename = malloc (strlen (infilename) + 10);
-
-        strcpy (in2filename, infilename);
-        strcat (in2filename, "c");
-        wvc_id = fopen (in2filename, "rb");
-        free (in2filename);
-    }
-    else
-        wvc_id = NULL;
-
-    wpc = WavpackOpenFileInputEx (&freader, wv_id, wvc_id, error, flags, norm_offset);
-
-    if (!wpc) {
-        if (wv_id)
-            fclose (wv_id);
-
-        if (wvc_id)
-            fclose (wvc_id);
-    }
-    else
-        wpc->close_files = TRUE;
-
-    return wpc;
-}
-
-#endif
-
-// This function is identical to WavpackOpenFileInput() except that instead
-// of providing a filename to open, the caller provides a pointer to a set of
-// reader callbacks and instances of up to two streams. The first of these
-// streams is required and contains the regular WavPack data stream; the second
-// contains the "correction" file if desired. Unlike the standard open
-// function which handles the correction file transparently, in this case it
-// is the responsibility of the caller to be aware of correction files.
-
-WavpackContext *WavpackOpenFileInputEx (WavpackStreamReader *reader, void *wv_id, void *wvc_id, char *error, int flags, int norm_offset)
-{
-    WavpackContext *wpc = malloc (sizeof (WavpackContext));
-    WavpackStream *wps;
-    int num_blocks = 0;
-    unsigned char first_byte;
-    uint32_t bcount;
-
-    if (!wpc) {
-        if (error) strcpy (error, "can't allocate memory");
-        return NULL;
-    }
-
-    CLEAR (*wpc);
-    wpc->wv_in = wv_id;
-    wpc->wvc_in = wvc_id;
-    wpc->reader = reader;
-    wpc->total_samples = (uint32_t) -1;
-    wpc->norm_offset = norm_offset;
-    wpc->max_streams = OLD_MAX_STREAMS;     // use this until overwritten with actual number
-    wpc->open_flags = flags;
-
-    wpc->filelen = wpc->reader->get_length (wpc->wv_in);
-
-#ifndef NO_TAGS
-    if ((flags & (OPEN_TAGS | OPEN_EDIT_TAGS)) && wpc->reader->can_seek (wpc->wv_in)) {
-        load_tag (wpc);
-        wpc->reader->set_pos_abs (wpc->wv_in, 0);
-
-        if ((flags & OPEN_EDIT_TAGS) && !editable_tag (&wpc->m_tag)) {
-            if (error) strcpy (error, "can't edit tags located at the beginning of files!");
-            return WavpackCloseFile (wpc);
-        }
-    }
-#endif
-
-#ifndef VER4_ONLY
-    if (wpc->reader->read_bytes (wpc->wv_in, &first_byte, 1) != 1) {
-        if (error) strcpy (error, "can't read all of WavPack file!");
-        return WavpackCloseFile (wpc);
-    }
-
-    wpc->reader->push_back_byte (wpc->wv_in, first_byte);
-
-    if (first_byte == 'R')
-        return open_file3 (wpc, error);
-#endif
-
-    wpc->streams = malloc ((wpc->num_streams = 1) * sizeof (wpc->streams [0]));
-    wpc->streams [0] = wps = malloc (sizeof (WavpackStream));
-    CLEAR (*wps);
-
-    while (!wps->wphdr.block_samples) {
-
-        wpc->filepos = wpc->reader->get_pos (wpc->wv_in);
-        bcount = read_next_header (wpc->reader, wpc->wv_in, &wps->wphdr);
-
-        if (bcount == (uint32_t) -1 ||
-            (!wps->wphdr.block_samples && num_blocks++ > 16)) {
-                if (error) strcpy (error, "not compatible with this version of WavPack file!");
-                return WavpackCloseFile (wpc);
-        }
-
-        wpc->filepos += bcount;
-        wps->blockbuff = malloc (wps->wphdr.ckSize + 8);
-        memcpy (wps->blockbuff, &wps->wphdr, 32);
-
-        if (wpc->reader->read_bytes (wpc->wv_in, wps->blockbuff + 32, wps->wphdr.ckSize - 24) != wps->wphdr.ckSize - 24) {
-            if (error) strcpy (error, "can't read all of WavPack file!");
-            return WavpackCloseFile (wpc);
-        }
-
-        wps->init_done = FALSE;
-
-        if (wps->wphdr.block_samples && !(flags & OPEN_STREAMING)) {
-            if (wps->wphdr.block_index || wps->wphdr.total_samples == (uint32_t) -1) {
-                wpc->initial_index = wps->wphdr.block_index;
-                wps->wphdr.block_index = 0;
-
-                if (wpc->reader->can_seek (wpc->wv_in)) {
-                    uint32_t pos_save = wpc->reader->get_pos (wpc->wv_in);
-                    uint32_t final_index = seek_final_index (wpc->reader, wpc->wv_in);
-
-                    if (final_index != (uint32_t) -1)
-                        wpc->total_samples = final_index - wpc->initial_index;
-
-                    wpc->reader->set_pos_abs (wpc->wv_in, pos_save);
-                }
-            }
-            else
-                wpc->total_samples = wps->wphdr.total_samples;
-        }
-
-        if (wpc->wvc_in && wps->wphdr.block_samples && (wps->wphdr.flags & HYBRID_FLAG)) {
-            wpc->file2len = wpc->reader->get_length (wpc->wvc_in);
-            wpc->wvc_flag = TRUE;
-        }
-
-        if (wpc->wvc_flag && !read_wvc_block (wpc)) {
-            if (error) strcpy (error, "not compatible with this version of correction file!");
-            return WavpackCloseFile (wpc);
-        }
-
-        if (!wps->init_done && !unpack_init (wpc)) {
-            if (error) strcpy (error, wpc->error_message [0] ? wpc->error_message :
-                "not compatible with this version of WavPack file!");
-
-            return WavpackCloseFile (wpc);
-        }
-
-        wps->init_done = TRUE;
-    }
-
-    wpc->config.flags &= ~0xff;
-    wpc->config.flags |= wps->wphdr.flags & 0xff;
-    wpc->config.bytes_per_sample = (wps->wphdr.flags & BYTES_STORED) + 1;
-    wpc->config.float_norm_exp = wps->float_norm_exp;
-
-    wpc->config.bits_per_sample = (wpc->config.bytes_per_sample * 8) -
-        ((wps->wphdr.flags & SHIFT_MASK) >> SHIFT_LSB);
-
-    if (!wpc->config.sample_rate) {
-        if (!wps->wphdr.block_samples || (wps->wphdr.flags & SRATE_MASK) == SRATE_MASK)
-            wpc->config.sample_rate = 44100;
-        else
-            wpc->config.sample_rate = sample_rates [(wps->wphdr.flags & SRATE_MASK) >> SRATE_LSB];
-    }
-
-    if (!wpc->config.num_channels) {
-        wpc->config.num_channels = (wps->wphdr.flags & MONO_FLAG) ? 1 : 2;
-        wpc->config.channel_mask = 0x5 - wpc->config.num_channels;
-    }
-
-    if ((flags & OPEN_2CH_MAX) && !(wps->wphdr.flags & FINAL_BLOCK))
-        wpc->reduced_channels = (wps->wphdr.flags & MONO_FLAG) ? 1 : 2;
-
-    return wpc;
-}
-
-// This function obtains general information about an open input file and
-// returns a mask with the following bit values:
-
-// MODE_WVC:  a .wvc file has been found and will be used for lossless
-// MODE_LOSSLESS:  file is lossless (either pure or hybrid)
-// MODE_HYBRID:  file is hybrid mode (either lossy or lossless)
-// MODE_FLOAT:  audio data is 32-bit ieee floating point
-// MODE_VALID_TAG:  file conatins a valid ID3v1 or APEv2 tag
-// MODE_HIGH:  file was created in "high" mode (information only)
-// MODE_FAST:  file was created in "fast" mode (information only)
-// MODE_EXTRA:  file was created using "extra" mode (information only)
-// MODE_APETAG:  file contains a valid APEv2 tag
-// MODE_SFX:  file was created as a "self-extracting" executable
-// MODE_VERY_HIGH:  file was created in the "very high" mode (or in
-//                  the "high" mode prior to 4.4)
-// MODE_MD5:  file contains an MD5 checksum
-// MODE_XMODE:  level used for extra mode (1-6, 0=unknown)
-// MODE_DNS:  dynamic noise shaping
-
-int WavpackGetMode (WavpackContext *wpc)
-{
-    int mode = 0;
-
-    if (wpc) {
-        if (wpc->config.flags & CONFIG_HYBRID_FLAG)
-            mode |= MODE_HYBRID;
-        else if (!(wpc->config.flags & CONFIG_LOSSY_MODE))
-            mode |= MODE_LOSSLESS;
-
-        if (wpc->wvc_flag)
-            mode |= (MODE_LOSSLESS | MODE_WVC);
-
-        if (wpc->lossy_blocks)
-            mode &= ~MODE_LOSSLESS;
-
-        if (wpc->config.flags & CONFIG_FLOAT_DATA)
-            mode |= MODE_FLOAT;
-
-        if (wpc->config.flags & (CONFIG_HIGH_FLAG | CONFIG_VERY_HIGH_FLAG)) {
-            mode |= MODE_HIGH;
-
-            if ((wpc->config.flags & CONFIG_VERY_HIGH_FLAG) ||
-                (wpc->streams && wpc->streams [0] && wpc->streams [0]->wphdr.version < 0x405))
-                    mode |= MODE_VERY_HIGH;
-        }
-
-        if (wpc->config.flags & CONFIG_FAST_FLAG)
-            mode |= MODE_FAST;
-
-        if (wpc->config.flags & CONFIG_EXTRA_MODE)
-            mode |= (MODE_EXTRA | (wpc->config.xmode << 12));
-
-        if (wpc->config.flags & CONFIG_CREATE_EXE)
-            mode |= MODE_SFX;
-
-        if (wpc->config.flags & CONFIG_MD5_CHECKSUM)
-            mode |= MODE_MD5;
-
-        if ((wpc->config.flags & CONFIG_HYBRID_FLAG) && (wpc->config.flags & CONFIG_DYNAMIC_SHAPING) &&
-            wpc->streams && wpc->streams [0] && wpc->streams [0]->wphdr.version >= 0x407)
-                mode |= MODE_DNS;
-
-#ifndef NO_TAGS
-        if (valid_tag (&wpc->m_tag)) {
-            mode |= MODE_VALID_TAG;
-
-            if (valid_tag (&wpc->m_tag) == 'A')
-                mode |= MODE_APETAG;
-        }
-#endif
-    }
-
-    return mode;
-}
-
-// This function returns the major version number of the WavPack program
-// (or library) that created the open file. Currently, this can be 1 to 4.
-// Minor versions are not recorded in WavPack files.
-
-int WavpackGetVersion (WavpackContext *wpc)
-{
-    if (wpc) {
-#ifndef VER4_ONLY
-        if (wpc->stream3)
-            return get_version3 (wpc);
-#endif
-        return 4;
-    }
-
-    return 0;
-}
-
-#endif
-
-// This function returns a pointer to a string describing the last error
-// generated by WavPack.
-
-char *WavpackGetErrorMessage (WavpackContext *wpc)
-{
-    return wpc->error_message;
-}
-
-#ifndef NO_UNPACK
-
-// Unpack the specified number of samples from the current file position.
-// Note that "samples" here refers to "complete" samples, which would be
-// 2 longs for stereo files or even more for multichannel files, so the
-// required memory at "buffer" is 4 * samples * num_channels bytes. The
-// audio data is returned right-justified in 32-bit longs in the endian
-// mode native to the executing processor. So, if the original data was
-// 16-bit, then the values returned would be +/-32k. Floating point data
-// can also be returned if the source was floating point data (and this
-// can be optionally normalized to +/-1.0 by using the appropriate flag
-// in the call to WavpackOpenFileInput ()). The actual number of samples
-// unpacked is returned, which should be equal to the number requested unless
-// the end of fle is encountered or an error occurs. After all samples have
-// been unpacked then 0 will be returned.
-
-uint32_t WavpackUnpackSamples (WavpackContext *wpc, int32_t *buffer, uint32_t samples)
-{
-    WavpackStream *wps = wpc->streams ? wpc->streams [wpc->current_stream = 0] : NULL;
-    uint32_t bcount, samples_unpacked = 0, samples_to_unpack;
-    int num_channels = wpc->config.num_channels;
-    int file_done = FALSE;
-
-#ifndef VER4_ONLY
-    if (wpc->stream3)
-        return unpack_samples3 (wpc, buffer, samples);
-#endif
-
-    while (samples) {
-        if (!wps->wphdr.block_samples || !(wps->wphdr.flags & INITIAL_BLOCK) ||
-            wps->sample_index >= wps->wphdr.block_index + wps->wphdr.block_samples) {
-
-                uint32_t nexthdrpos;
-
-                if (wpc->wrapper_bytes >= MAX_WRAPPER_BYTES)
-                    break;
-
-                free_streams (wpc);
-                nexthdrpos = wpc->reader->get_pos (wpc->wv_in);
-                bcount = read_next_header (wpc->reader, wpc->wv_in, &wps->wphdr);
-
-                if (bcount == (uint32_t) -1)
-                    break;
-
-                wpc->filepos = nexthdrpos;
-
-                if (wpc->open_flags & OPEN_STREAMING)
-                    wps->wphdr.block_index = wps->sample_index = 0;
-                else
-                    wps->wphdr.block_index -= wpc->initial_index;
-
-                wpc->filepos += bcount;
-                wps->blockbuff = malloc (wps->wphdr.ckSize + 8);
-                memcpy (wps->blockbuff, &wps->wphdr, 32);
-
-                if (wpc->reader->read_bytes (wpc->wv_in, wps->blockbuff + 32, wps->wphdr.ckSize - 24) !=
-                    wps->wphdr.ckSize - 24) {
-                        strcpy (wpc->error_message, "can't read all of last block!");
-                        wps->wphdr.block_samples = 0;
-                        wps->wphdr.ckSize = 24;
-                        break;
-                }
-
-                wps->init_done = FALSE;
-
-                if (wps->wphdr.block_samples && wps->sample_index != wps->wphdr.block_index)
-                    wpc->crc_errors++;
-
-                if (wps->wphdr.block_samples && wpc->wvc_flag)
-                    read_wvc_block (wpc);
-
-                if (!wps->wphdr.block_samples) {
-                    if (!wps->init_done && !unpack_init (wpc))
-                        wpc->crc_errors++;
-
-                    wps->init_done = TRUE;
-                }
-        }
-
-        if (!wps->wphdr.block_samples || !(wps->wphdr.flags & INITIAL_BLOCK) ||
-            wps->sample_index >= wps->wphdr.block_index + wps->wphdr.block_samples)
-                continue;
-
-        if (wps->sample_index < wps->wphdr.block_index) {
-            samples_to_unpack = wps->wphdr.block_index - wps->sample_index;
-
-            if (samples_to_unpack > 262144) {
-                strcpy (wpc->error_message, "discontinuity found, aborting file!");
-                wps->wphdr.block_samples = 0;
-                wps->wphdr.ckSize = 24;
-                break;
-            }
-
-            if (samples_to_unpack > samples)
-                samples_to_unpack = samples;
-
-            wps->sample_index += samples_to_unpack;
-            samples_unpacked += samples_to_unpack;
-            samples -= samples_to_unpack;
-
-            if (wpc->reduced_channels)
-                samples_to_unpack *= wpc->reduced_channels;
-            else
-                samples_to_unpack *= num_channels;
-
-            while (samples_to_unpack--)
-                *buffer++ = 0;
-
-            continue;
-        }
-
-        samples_to_unpack = wps->wphdr.block_index + wps->wphdr.block_samples - wps->sample_index;
-
-        if (samples_to_unpack > samples)
-            samples_to_unpack = samples;
-
-        if (!wps->init_done && !unpack_init (wpc))
-            wpc->crc_errors++;
-
-        wps->init_done = TRUE;
-
-        if (!wpc->reduced_channels && !(wps->wphdr.flags & FINAL_BLOCK)) {
-            int32_t *temp_buffer = malloc (samples_to_unpack * 8), *src, *dst;
-            int offset = 0;
-            uint32_t samcnt;
-
-            while (1) {
-                if (wpc->current_stream == wpc->num_streams) {
-                    wpc->streams = realloc (wpc->streams, (wpc->num_streams + 1) * sizeof (wpc->streams [0]));
-                    wps = wpc->streams [wpc->num_streams++] = malloc (sizeof (WavpackStream));
-                    CLEAR (*wps);
-                    bcount = read_next_header (wpc->reader, wpc->wv_in, &wps->wphdr);
-
-                    if (bcount == (uint32_t) -1) {
-                        wpc->streams [0]->wphdr.block_samples = 0;
-                        wpc->streams [0]->wphdr.ckSize = 24;
-                        file_done = TRUE;
-                        break;
-                    }
-
-                    if (wpc->open_flags & OPEN_STREAMING)
-                        wps->wphdr.block_index = wps->sample_index = 0;
-                    else
-                        wps->wphdr.block_index -= wpc->initial_index;
-
-                    wps->blockbuff = malloc (wps->wphdr.ckSize + 8);
-                    memcpy (wps->blockbuff, &wps->wphdr, 32);
-
-                    if (wpc->reader->read_bytes (wpc->wv_in, wps->blockbuff + 32, wps->wphdr.ckSize - 24) !=
-                        wps->wphdr.ckSize - 24) {
-                            wpc->streams [0]->wphdr.block_samples = 0;
-                            wpc->streams [0]->wphdr.ckSize = 24;
-                            file_done = TRUE;
-                            break;
-                    }
-
-                    wps->init_done = FALSE;
-
-                    if (wpc->wvc_flag)
-                        read_wvc_block (wpc);
-
-                    if (!wps->init_done && !unpack_init (wpc))
-                        wpc->crc_errors++;
-
-                    wps->init_done = TRUE;
-                }
-                else
-                    wps = wpc->streams [wpc->current_stream];
-
-                unpack_samples (wpc, src = temp_buffer, samples_to_unpack);
-                samcnt = samples_to_unpack;
-                dst = buffer + offset;
-
-                if (wps->wphdr.flags & MONO_FLAG) {
-                    while (samcnt--) {
-                        dst [0] = *src++;
-                        dst += num_channels;
-                    }
-
-                    offset++;
-                }
-                else if (offset == num_channels - 1) {
-                    while (samcnt--) {
-                        dst [0] = src [0];
-                        dst += num_channels;
-                        src += 2;
-                    }
-
-                    wpc->crc_errors++;
-                    offset++;
-                }
-                else {
-                    while (samcnt--) {
-                        dst [0] = *src++;
-                        dst [1] = *src++;
-                        dst += num_channels;
-                    }
-
-                    offset += 2;
-                }
-
-                if ((wps->wphdr.flags & FINAL_BLOCK) || wpc->current_stream == wpc->max_streams - 1 || offset == num_channels)
-                    break;
-                else
-                    wpc->current_stream++;
-            }
-
-            wps = wpc->streams [wpc->current_stream = 0];
-            free (temp_buffer);
-        }
-        else
-            unpack_samples (wpc, buffer, samples_to_unpack);
-
-        if (file_done) {
-            strcpy (wpc->error_message, "can't read all of last block!");
-            break;
-        }
-
-        if (wpc->reduced_channels)
-            buffer += samples_to_unpack * wpc->reduced_channels;
-        else
-            buffer += samples_to_unpack * num_channels;
-
-        samples_unpacked += samples_to_unpack;
-        samples -= samples_to_unpack;
-
-        if (wps->sample_index == wps->wphdr.block_index + wps->wphdr.block_samples) {
-            if (check_crc_error (wpc) && wps->blockbuff) {
-
-                if (wpc->reader->can_seek (wpc->wv_in)) {
-                    int32_t rseek = ((WavpackHeader *) wps->blockbuff)->ckSize / 3;
-                    wpc->reader->set_pos_rel (wpc->wv_in, (rseek > 16384) ? -16384 : -rseek, SEEK_CUR);
-                }
-
-                if (wpc->wvc_flag && wps->block2buff && wpc->reader->can_seek (wpc->wvc_in)) {
-                    int32_t rseek = ((WavpackHeader *) wps->block2buff)->ckSize / 3;
-                    wpc->reader->set_pos_rel (wpc->wvc_in, (rseek > 16384) ? -16384 : -rseek, SEEK_CUR);
-                }
-
-                wpc->crc_errors++;
-            }
-        }
-
-        if (wpc->total_samples != (uint32_t) -1 && wps->sample_index == wpc->total_samples)
-            break;
-    }
-
-    return samples_unpacked;
-}
-
-#ifndef NO_SEEKING
-
-static uint32_t find_sample (WavpackContext *wpc, void *infile, uint32_t header_pos, uint32_t sample);
-
-// Seek to the specifed sample index, returning TRUE on success. Note that
-// files generated with version 4.0 or newer will seek almost immediately.
-// Older files can take quite long if required to seek through unplayed
-// portions of the file, but will create a seek map so that reverse seeks
-// (or forward seeks to already scanned areas) will be very fast. After a
-// FALSE return the file should not be accessed again (other than to close
-// it); this is a fatal error.
-
-int WavpackSeekSample (WavpackContext *wpc, uint32_t sample)
-{
-    WavpackStream *wps = wpc->streams ? wpc->streams [wpc->current_stream = 0] : NULL;
-    uint32_t bcount, samples_to_skip;
-    int32_t *buffer;
-
-    if (wpc->total_samples == (uint32_t) -1 || sample >= wpc->total_samples ||
-        !wpc->reader->can_seek (wpc->wv_in) || (wpc->open_flags & OPEN_STREAMING) ||
-        (wpc->wvc_flag && !wpc->reader->can_seek (wpc->wvc_in)))
-            return FALSE;
-
-#ifndef VER4_ONLY
-    if (wpc->stream3)
-        return seek_sample3 (wpc, sample);
-#endif
-
-    if (!wps->wphdr.block_samples || !(wps->wphdr.flags & INITIAL_BLOCK) || sample < wps->wphdr.block_index ||
-        sample >= wps->wphdr.block_index + wps->wphdr.block_samples) {
-
-            free_streams (wpc);
-            wpc->filepos = find_sample (wpc, wpc->wv_in, wpc->filepos, sample);
-
-            if (wpc->filepos == (uint32_t) -1)
-                return FALSE;
-
-            if (wpc->wvc_flag) {
-                wpc->file2pos = find_sample (wpc, wpc->wvc_in, 0, sample);
-
-                if (wpc->file2pos == (uint32_t) -1)
-                    return FALSE;
-            }
-    }
-
-    if (!wps->blockbuff) {
-        wpc->reader->set_pos_abs (wpc->wv_in, wpc->filepos);
-        wpc->reader->read_bytes (wpc->wv_in, &wps->wphdr, sizeof (WavpackHeader));
-        little_endian_to_native (&wps->wphdr, WavpackHeaderFormat);
-        wps->wphdr.block_index -= wpc->initial_index;
-        wps->blockbuff = malloc (wps->wphdr.ckSize + 8);
-        memcpy (wps->blockbuff, &wps->wphdr, sizeof (WavpackHeader));
-
-        if (wpc->reader->read_bytes (wpc->wv_in, wps->blockbuff + sizeof (WavpackHeader), wps->wphdr.ckSize - 24) !=
-            wps->wphdr.ckSize - 24) {
-                free_streams (wpc);
-                return FALSE;
-        }
-
-        wps->init_done = FALSE;
-
-        if (wpc->wvc_flag) {
-            wpc->reader->set_pos_abs (wpc->wvc_in, wpc->file2pos);
-            wpc->reader->read_bytes (wpc->wvc_in, &wps->wphdr, sizeof (WavpackHeader));
-            little_endian_to_native (&wps->wphdr, WavpackHeaderFormat);
-            wps->wphdr.block_index -= wpc->initial_index;
-            wps->block2buff = malloc (wps->wphdr.ckSize + 8);
-            memcpy (wps->block2buff, &wps->wphdr, sizeof (WavpackHeader));
-
-            if (wpc->reader->read_bytes (wpc->wvc_in, wps->block2buff + sizeof (WavpackHeader), wps->wphdr.ckSize - 24) !=
-                wps->wphdr.ckSize - 24) {
-                    free_streams (wpc);
-                    return FALSE;
-            }
-        }
-
-        if (!wps->init_done && !unpack_init (wpc)) {
-            free_streams (wpc);
-            return FALSE;
-        }
-
-        wps->init_done = TRUE;
-    }
-
-    while (!wpc->reduced_channels && !(wps->wphdr.flags & FINAL_BLOCK)) {
-        if (++wpc->current_stream == wpc->num_streams) {
-
-            if (wpc->num_streams == wpc->max_streams) {
-                free_streams (wpc);
-                return FALSE;
-            }
-
-            wpc->streams = realloc (wpc->streams, (wpc->num_streams + 1) * sizeof (wpc->streams [0]));
-            wps = wpc->streams [wpc->num_streams++] = malloc (sizeof (WavpackStream));
-            CLEAR (*wps);
-            bcount = read_next_header (wpc->reader, wpc->wv_in, &wps->wphdr);
-
-            if (bcount == (uint32_t) -1) {
-                free_streams (wpc);
-                return FALSE;
-            }
-
-            wps->blockbuff = malloc (wps->wphdr.ckSize + 8);
-            memcpy (wps->blockbuff, &wps->wphdr, 32);
-
-            if (wpc->reader->read_bytes (wpc->wv_in, wps->blockbuff + 32, wps->wphdr.ckSize - 24) !=
-                wps->wphdr.ckSize - 24) {
-                    free_streams (wpc);
-                    return FALSE;
-            }
-
-            wps->init_done = FALSE;
-
-            if (wpc->wvc_flag && !read_wvc_block (wpc)) {
-                free_streams (wpc);
-                return FALSE;
-            }
-
-            if (!wps->init_done && !unpack_init (wpc)) {
-                free_streams (wpc);
-                return FALSE;
-            }
-
-            wps->init_done = TRUE;
-        }
-        else
-            wps = wpc->streams [wpc->current_stream];
-    }
-
-    if (sample < wps->sample_index) {
-        for (wpc->current_stream = 0; wpc->current_stream < wpc->num_streams; wpc->current_stream++)
-            if (!unpack_init (wpc))
-                return FALSE;
-            else
-                wpc->streams [wpc->current_stream]->init_done = TRUE;
-    }
-
-    samples_to_skip = sample - wps->sample_index;
-
-    if (samples_to_skip > 131072) {
-        free_streams (wpc);
-        return FALSE;
-    }
-
-    if (samples_to_skip) {
-        buffer = malloc (samples_to_skip * 8);
-
-        for (wpc->current_stream = 0; wpc->current_stream < wpc->num_streams; wpc->current_stream++)
-            unpack_samples (wpc, buffer, samples_to_skip);
-
-        free (buffer);
-    }
-
-    wpc->current_stream = 0;
-    return TRUE;
-}
-
-#endif
-
-#endif
-
-#ifndef NO_PACK
-
-// Open context for writing WavPack files. The returned context pointer is used
-// in all following calls to the library. The "blockout" function will be used
-// to store the actual completed WavPack blocks and will be called with the id
-// pointers containing user defined data (one for the wv file and one for the
-// wvc file). A return value of NULL indicates that memory could not be
-// allocated for the context.
-
-WavpackContext *WavpackOpenFileOutput (WavpackBlockOutput blockout, void *wv_id, void *wvc_id)
-{
-    WavpackContext *wpc = malloc (sizeof (WavpackContext));
-
-    if (!wpc)
-        return NULL;
-
-    CLEAR (*wpc);
-    wpc->blockout = blockout;
-    wpc->wv_out = wv_id;
-    wpc->wvc_out = wvc_id;
-    return wpc;
-}
-
-// Set configuration for writing WavPack files. This must be done before
-// sending any actual samples, however it is okay to send wrapper or other
-// metadata before calling this. The "config" structure contains the following
-// required information:
-
-// config->bytes_per_sample     see WavpackGetBytesPerSample() for info
-// config->bits_per_sample      see WavpackGetBitsPerSample() for info
-// config->channel_mask         Microsoft standard (mono = 4, stereo = 3)
-// config->num_channels         self evident
-// config->sample_rate          self evident
-
-// In addition, the following fields and flags may be set:
-
-// config->flags:
-// --------------
-// o CONFIG_HYBRID_FLAG         select hybrid mode (must set bitrate)
-// o CONFIG_JOINT_STEREO        select joint stereo (must set override also)
-// o CONFIG_JOINT_OVERRIDE      override default joint stereo selection
-// o CONFIG_HYBRID_SHAPE        select hybrid noise shaping (set override &
-//                                                      shaping_weight != 0.0)
-// o CONFIG_SHAPE_OVERRIDE      override default hybrid noise shaping
-//                               (set CONFIG_HYBRID_SHAPE and shaping_weight)
-// o CONFIG_FAST_FLAG           "fast" compression mode
-// o CONFIG_HIGH_FLAG           "high" compression mode
-// o CONFIG_BITRATE_KBPS        hybrid bitrate is kbps, not bits / sample
-// o CONFIG_CREATE_WVC          create correction file
-// o CONFIG_OPTIMIZE_WVC        maximize bybrid compression (-cc option)
-// o CONFIG_CALC_NOISE          calc noise in hybrid mode
-// o CONFIG_EXTRA_MODE          extra processing mode (slow!)
-// o CONFIG_SKIP_WVX            no wvx stream for floats & large ints
-// o CONFIG_MD5_CHECKSUM        specify if you plan to store MD5 signature
-// o CONFIG_CREATE_EXE          specify if you plan to prepend sfx module
-// o CONFIG_OPTIMIZE_MONO       detect and optimize for mono files posing as
-//                               stereo (uses a more recent stream format that
-//                               is not compatible with decoders < 4.3)
-
-// config->bitrate              hybrid bitrate in either bits/sample or kbps
-// config->shaping_weight       hybrid noise shaping coefficient override
-// config->block_samples        force samples per WavPack block (0 = use deflt)
-// config->float_norm_exp       select floating-point data (127 for +/-1.0)
-// config->xmode                extra mode processing value override
-
-// If the number of samples to be written is known then it should be passed
-// here. If the duration is not known then pass -1. In the case that the size
-// is not known (or the writing is terminated early) then it is suggested that
-// the application retrieve the first block written and let the library update
-// the total samples indication. A function is provided to do this update and
-// it should be done to the "correction" file also. If this cannot be done
-// (because a pipe is being used, for instance) then a valid WavPack will still
-// be created, but when applications want to access that file they will have
-// to seek all the way to the end to determine the actual duration. Also, if
-// a RIFF header has been included then it should be updated as well or the
-// WavPack file will not be directly unpackable to a valid wav file (although
-// it will still be usable by itself). A return of FALSE indicates an error.
-
-int WavpackSetConfiguration (WavpackContext *wpc, WavpackConfig *config, uint32_t total_samples)
-{
-    uint32_t flags = (config->bytes_per_sample - 1), bps = 0, shift = 0;
-    uint32_t chan_mask = config->channel_mask;
-    int num_chans = config->num_channels;
-    int i;
-
-    wpc->total_samples = total_samples;
-    wpc->config.sample_rate = config->sample_rate;
-    wpc->config.num_channels = config->num_channels;
-    wpc->config.channel_mask = config->channel_mask;
-    wpc->config.bits_per_sample = config->bits_per_sample;
-    wpc->config.bytes_per_sample = config->bytes_per_sample;
-    wpc->config.block_samples = config->block_samples;
-    wpc->config.flags = config->flags;
-
-    if (config->flags & CONFIG_VERY_HIGH_FLAG)
-        wpc->config.flags |= CONFIG_HIGH_FLAG;
-
-    if (config->float_norm_exp) {
-        wpc->config.float_norm_exp = config->float_norm_exp;
-        wpc->config.flags |= CONFIG_FLOAT_DATA;
-        flags |= FLOAT_DATA;
-    }
-    else
-        shift = (config->bytes_per_sample * 8) - config->bits_per_sample;
-
-    for (i = 0; i < 15; ++i)
-        if (wpc->config.sample_rate == sample_rates [i])
-            break;
-
-    flags |= i << SRATE_LSB;
-    flags |= shift << SHIFT_LSB;
-
-    if (config->flags & CONFIG_HYBRID_FLAG) {
-        flags |= HYBRID_FLAG | HYBRID_BITRATE | HYBRID_BALANCE;
-
-        if (!(wpc->config.flags & CONFIG_SHAPE_OVERRIDE)) {
-            wpc->config.flags |= CONFIG_HYBRID_SHAPE | CONFIG_AUTO_SHAPING;
-            flags |= HYBRID_SHAPE | NEW_SHAPING;
-        }
-        else if (wpc->config.flags & CONFIG_HYBRID_SHAPE) {
-            wpc->config.shaping_weight = config->shaping_weight;
-            flags |= HYBRID_SHAPE | NEW_SHAPING;
-        }
-
-        if (wpc->config.flags & CONFIG_OPTIMIZE_WVC)
-            flags |= CROSS_DECORR;
-
-        if (config->flags & CONFIG_BITRATE_KBPS) {
-            bps = (uint32_t) floor (config->bitrate * 256000.0 / config->sample_rate / config->num_channels + 0.5);
-
-            if (bps > (64 << 8))
-                bps = 64 << 8;
-        }
-        else
-            bps = (uint32_t) floor (config->bitrate * 256.0 + 0.5);
-    }
-    else
-        flags |= CROSS_DECORR;
-
-    if (!(config->flags & CONFIG_JOINT_OVERRIDE) || (config->flags & CONFIG_JOINT_STEREO))
-        flags |= JOINT_STEREO;
-
-    if (config->flags & CONFIG_CREATE_WVC)
-        wpc->wvc_flag = TRUE;
-
-    wpc->stream_version = (config->flags & CONFIG_OPTIMIZE_MONO) ? MAX_STREAM_VERS : CUR_STREAM_VERS;
-
-    for (wpc->current_stream = 0; num_chans; wpc->current_stream++) {
-        WavpackStream *wps = malloc (sizeof (WavpackStream));
-        uint32_t stereo_mask, mono_mask;
-        int pos, chans = 0;
-
-        wpc->streams = realloc (wpc->streams, (wpc->current_stream + 1) * sizeof (wpc->streams [0]));
-        wpc->streams [wpc->current_stream] = wps;
-        CLEAR (*wps);
-
-        for (pos = 1; pos <= 18; ++pos) {
-            stereo_mask = 3 << (pos - 1);
-            mono_mask = 1 << (pos - 1);
-
-            if ((chan_mask & stereo_mask) == stereo_mask && (mono_mask & 0x251)) {
-                chan_mask &= ~stereo_mask;
-                chans = 2;
-                break;
-            }
-            else if (chan_mask & mono_mask) {
-                chan_mask &= ~mono_mask;
-                chans = 1;
-                break;
-            }
-        }
-
-        if (!chans) {
-            if (config->flags & CONFIG_PAIR_UNDEF_CHANS)
-                chans = num_chans > 1 ? 2 : 1;
-            else
-                chans = 1;
-        }
-
-        num_chans -= chans;
-
-        if (num_chans && wpc->current_stream == NEW_MAX_STREAMS - 1)
-            break;
-
-        memcpy (wps->wphdr.ckID, "wvpk", 4);
-        wps->wphdr.ckSize = sizeof (WavpackHeader) - 8;
-        wps->wphdr.total_samples = wpc->total_samples;
-        wps->wphdr.version = wpc->stream_version;
-        wps->wphdr.flags = flags;
-        wps->bits = bps;
-
-        if (!wpc->current_stream)
-            wps->wphdr.flags |= INITIAL_BLOCK;
-
-        if (!num_chans)
-            wps->wphdr.flags |= FINAL_BLOCK;
-
-        if (chans == 1) {
-            wps->wphdr.flags &= ~(JOINT_STEREO | CROSS_DECORR | HYBRID_BALANCE);
-            wps->wphdr.flags |= MONO_FLAG;
-        }
-    }
-
-    wpc->num_streams = wpc->current_stream;
-    wpc->current_stream = 0;
-
-    if (num_chans) {
-        strcpy (wpc->error_message, "too many channels!");
-        return FALSE;
-    }
-
-    if (config->flags & CONFIG_EXTRA_MODE)
-        wpc->config.xmode = config->xmode ? config->xmode : 1;
-
-    return TRUE;
-}
-
-// Prepare to actually pack samples by determining the size of the WavPack
-// blocks and allocating sample buffers and initializing each stream. Call
-// after WavpackSetConfiguration() and before WavpackPackSamples(). A return
-// of FALSE indicates an error.
-
-int WavpackPackInit (WavpackContext *wpc)
-{
-    if (wpc->metabytes > 16384)             // 16384 bytes still leaves plenty of room for audio
-        write_metadata_block (wpc);         //  in this block (otherwise write a special one)
-
-    if (wpc->config.flags & CONFIG_HIGH_FLAG)
-        wpc->block_samples = wpc->config.sample_rate;
-    else if (!(wpc->config.sample_rate % 2))
-        wpc->block_samples = wpc->config.sample_rate / 2;
-    else
-        wpc->block_samples = wpc->config.sample_rate;
-
-    while (wpc->block_samples * wpc->config.num_channels > 150000)
-        wpc->block_samples /= 2;
-
-    while (wpc->block_samples * wpc->config.num_channels < 40000)
-        wpc->block_samples *= 2;
-
-    if (wpc->config.block_samples) {
-        if ((wpc->config.flags & CONFIG_MERGE_BLOCKS) &&
-            wpc->block_samples > (uint32_t) wpc->config.block_samples) {
-                wpc->block_boundary = wpc->config.block_samples;
-                wpc->block_samples /= wpc->config.block_samples;
-                wpc->block_samples *= wpc->config.block_samples;
-        }
-        else
-            wpc->block_samples = wpc->config.block_samples;
-    }
-
-    wpc->ave_block_samples = wpc->block_samples;
-    wpc->max_samples = wpc->block_samples + (wpc->block_samples >> 1);
-
-    for (wpc->current_stream = 0; wpc->current_stream < wpc->num_streams; wpc->current_stream++) {
-        WavpackStream *wps = wpc->streams [wpc->current_stream];
-
-        wps->sample_buffer = malloc (wpc->max_samples * (wps->wphdr.flags & MONO_FLAG ? 4 : 8));
-        pack_init (wpc);
-    }
-
-    return TRUE;
-}
-
-// Pack the specified samples. Samples must be stored in longs in the native
-// endian format of the executing processor. The number of samples specified
-// indicates composite samples (sometimes called "frames"). So, the actual
-// number of data points would be this "sample_count" times the number of
-// channels. Note that samples are accumulated here until enough exist to
-// create a complete WavPack block (or several blocks for multichannel audio).
-// If an application wants to break a block at a specific sample, then it must
-// simply call WavpackFlushSamples() to force an early termination. Completed
-// WavPack blocks are send to the function provided in the initial call to
-// WavpackOpenFileOutput(). A return of FALSE indicates an error.
-
-static int pack_streams (WavpackContext *wpc, uint32_t block_samples);
-static int create_riff_header (WavpackContext *wpc);
-
-int WavpackPackSamples (WavpackContext *wpc, int32_t *sample_buffer, uint32_t sample_count)
-{
-    int nch = wpc->config.num_channels;
-
-    while (sample_count) {
-        int32_t *source_pointer = sample_buffer;
-        unsigned int samples_to_copy;
-
-        if (!wpc->riff_header_added && !wpc->riff_header_created && !create_riff_header (wpc))
-            return FALSE;
-
-        if (wpc->acc_samples + sample_count > wpc->max_samples)
-            samples_to_copy = wpc->max_samples - wpc->acc_samples;
-        else
-            samples_to_copy = sample_count;
-
-        for (wpc->current_stream = 0; wpc->current_stream < wpc->num_streams; wpc->current_stream++) {
-            WavpackStream *wps = wpc->streams [wpc->current_stream];
-            int32_t *dptr, *sptr, cnt;
-
-            dptr = wps->sample_buffer + wpc->acc_samples * (wps->wphdr.flags & MONO_FLAG ? 1 : 2);
-            sptr = source_pointer;
-            cnt = samples_to_copy;
-
-            if (wps->wphdr.flags & MONO_FLAG) {
-                while (cnt--) {
-                    *dptr++ = *sptr;
-                    sptr += nch;
-                }
-
-                source_pointer++;
-            }
-            else {
-                while (cnt--) {
-                    *dptr++ = sptr [0];
-                    *dptr++ = sptr [1];
-                    sptr += nch;
-                }
-
-                source_pointer += 2;
-            }
-        }
-
-        sample_buffer += samples_to_copy * nch;
-        sample_count -= samples_to_copy;
-
-        if ((wpc->acc_samples += samples_to_copy) == wpc->max_samples &&
-            !pack_streams (wpc, wpc->block_samples))
-                return FALSE;
-    }
-
-    return TRUE;
-}
-
-// Flush all accumulated samples into WavPack blocks. This is normally called
-// after all samples have been sent to WavpackPackSamples(), but can also be
-// called to terminate a WavPack block at a specific sample (in other words it
-// is possible to continue after this operation). This is also called to
-// dump non-audio blocks like those holding metadata for various purposes.
-// A return of FALSE indicates an error.
-
-int WavpackFlushSamples (WavpackContext *wpc)
-{
-    while (wpc->acc_samples) {
-        uint32_t block_samples;
-
-        if (wpc->acc_samples > wpc->block_samples)
-            block_samples = wpc->acc_samples / 2;
-        else
-            block_samples = wpc->acc_samples;
-
-        if (!pack_streams (wpc, block_samples))
-            return FALSE;
-    }
-
-    if (wpc->metacount)
-        write_metadata_block (wpc);
-
-    return TRUE;
-}
-
-// Note: The following function is no longer required because a proper wav
-// header is now automatically generated for the application. However, if the
-// application wants to generate its own header or wants to include additional
-// chunks, then this function can still be used in which case the automatic
-// wav header generation is suppressed.
-
-// Add wrapper (currently RIFF only) to WavPack blocks. This should be called
-// before sending any audio samples for the RIFF header or after all samples
-// have been sent for any RIFF trailer. WavpackFlushSamples() should be called
-// between sending the last samples and calling this for trailer data to make
-// sure that headers and trailers don't get mixed up in very short files. If
-// the exact contents of the RIFF header are not known because, for example,
-// the file duration is uncertain or trailing chunks are possible, simply write
-// a "dummy" header of the correct length. When all data has been written it
-// will be possible to read the first block written and update the header
-// directly. An example of this can be found in the Audition filter. A
-// return of FALSE indicates an error.
-
-int WavpackAddWrapper (WavpackContext *wpc, void *data, uint32_t bcount)
-{
-    uint32_t index = WavpackGetSampleIndex (wpc);
-    unsigned char meta_id;
-
-    if (!index || index == (uint32_t) -1) {
-        wpc->riff_header_added = TRUE;
-        meta_id = ID_RIFF_HEADER;
-    }
-    else {
-        wpc->riff_trailer_bytes += bcount;
-        meta_id = ID_RIFF_TRAILER;
-    }
-
-    return add_to_metadata (wpc, data, bcount, meta_id);
-}
-
-// Store computed MD5 sum in WavPack metadata. Note that the user must compute
-// the 16 byte sum; it is not done here. A return of FALSE indicates an error.
-
-int WavpackStoreMD5Sum (WavpackContext *wpc, unsigned char data [16])
-{
-    return add_to_metadata (wpc, data, 16, ID_MD5_CHECKSUM);
-}
-
-static int create_riff_header (WavpackContext *wpc)
-{
-    RiffChunkHeader riffhdr;
-    ChunkHeader datahdr, fmthdr;
-    WaveHeader wavhdr;
-
-    uint32_t total_samples = wpc->total_samples, total_data_bytes;
-    int32_t channel_mask = wpc->config.channel_mask;
-    int32_t sample_rate = wpc->config.sample_rate;
-    int bytes_per_sample = wpc->config.bytes_per_sample;
-    int bits_per_sample = wpc->config.bits_per_sample;
-    int format = (wpc->config.float_norm_exp) ? 3 : 1;
-    int num_channels = wpc->config.num_channels;
-    int wavhdrsize = 16;
-
-    wpc->riff_header_created = TRUE;
-
-    if (format == 3 && wpc->config.float_norm_exp != 127) {
-        strcpy (wpc->error_message, "can't create valid RIFF wav header for non-normalized floating data!");
-        return FALSE;
-    }
-
-    if (total_samples == (uint32_t) -1)
-        total_samples = 0x7ffff000 / (bytes_per_sample * num_channels);
-
-    total_data_bytes = total_samples * bytes_per_sample * num_channels;
-
-    CLEAR (wavhdr);
-
-    wavhdr.FormatTag = format;
-    wavhdr.NumChannels = num_channels;
-    wavhdr.SampleRate = sample_rate;
-    wavhdr.BytesPerSecond = sample_rate * num_channels * bytes_per_sample;
-    wavhdr.BlockAlign = bytes_per_sample * num_channels;
-    wavhdr.BitsPerSample = bits_per_sample;
-
-    if (num_channels > 2 || channel_mask != 0x5 - num_channels) {
-        wavhdrsize = sizeof (wavhdr);
-        wavhdr.cbSize = 22;
-        wavhdr.ValidBitsPerSample = bits_per_sample;
-        wavhdr.SubFormat = format;
-        wavhdr.ChannelMask = channel_mask;
-        wavhdr.FormatTag = 0xfffe;
-        wavhdr.BitsPerSample = bytes_per_sample * 8;
-        wavhdr.GUID [4] = 0x10;
-        wavhdr.GUID [6] = 0x80;
-        wavhdr.GUID [9] = 0xaa;
-        wavhdr.GUID [11] = 0x38;
-        wavhdr.GUID [12] = 0x9b;
-        wavhdr.GUID [13] = 0x71;
-    }
-
-    strncpy (riffhdr.ckID, "RIFF", sizeof (riffhdr.ckID));
-    strncpy (riffhdr.formType, "WAVE", sizeof (riffhdr.formType));
-    riffhdr.ckSize = sizeof (riffhdr) + wavhdrsize + sizeof (datahdr) + total_data_bytes;
-    strncpy (fmthdr.ckID, "fmt ", sizeof (fmthdr.ckID));
-    fmthdr.ckSize = wavhdrsize;
-
-    strncpy (datahdr.ckID, "data", sizeof (datahdr.ckID));
-    datahdr.ckSize = total_data_bytes;
-
-    // write the RIFF chunks up to just before the data starts
-
-    native_to_little_endian (&riffhdr, ChunkHeaderFormat);
-    native_to_little_endian (&fmthdr, ChunkHeaderFormat);
-    native_to_little_endian (&wavhdr, WaveHeaderFormat);
-    native_to_little_endian (&datahdr, ChunkHeaderFormat);
-
-    return add_to_metadata (wpc, &riffhdr, sizeof (riffhdr), ID_RIFF_HEADER) &&
-        add_to_metadata (wpc, &fmthdr, sizeof (fmthdr), ID_RIFF_HEADER) &&
-        add_to_metadata (wpc, &wavhdr, wavhdrsize, ID_RIFF_HEADER) &&
-        add_to_metadata (wpc, &datahdr, sizeof (datahdr), ID_RIFF_HEADER);
-}
-
-static int pack_streams (WavpackContext *wpc, uint32_t block_samples)
-{
-    uint32_t max_blocksize, bcount;
-    unsigned char *outbuff, *outend, *out2buff, *out2end;
-    int result = TRUE;
-
-    if ((wpc->config.flags & CONFIG_FLOAT_DATA) && !(wpc->config.flags & CONFIG_SKIP_WVX))
-        max_blocksize = block_samples * 16 + 4096;
-    else
-        max_blocksize = block_samples * 10 + 4096;
-
-    out2buff = (wpc->wvc_flag) ? malloc (max_blocksize) : NULL;
-    out2end = out2buff + max_blocksize;
-    outbuff = malloc (max_blocksize);
-    outend = outbuff + max_blocksize;
-
-    for (wpc->current_stream = 0; wpc->current_stream < wpc->num_streams; wpc->current_stream++) {
-        WavpackStream *wps = wpc->streams [wpc->current_stream];
-        uint32_t flags = wps->wphdr.flags;
-
-        flags &= ~MAG_MASK;
-        flags += (1 << MAG_LSB) * ((flags & BYTES_STORED) * 8 + 7);
-
-        wps->wphdr.block_index = wps->sample_index;
-        wps->wphdr.block_samples = block_samples;
-        wps->wphdr.flags = flags;
-        wps->block2buff = out2buff;
-        wps->block2end = out2end;
-        wps->blockbuff = outbuff;
-        wps->blockend = outend;
-
-        result = pack_block (wpc, wps->sample_buffer);
-        wps->blockbuff = wps->block2buff = NULL;
-
-        if (wps->wphdr.block_samples != block_samples)
-            block_samples = wps->wphdr.block_samples;
-
-        if (!result) {
-            strcpy (wpc->error_message, "output buffer overflowed!");
-            break;
-        }
-
-        bcount = ((WavpackHeader *) outbuff)->ckSize + 8;
-        native_to_little_endian ((WavpackHeader *) outbuff, WavpackHeaderFormat);
-        result = wpc->blockout (wpc->wv_out, outbuff, bcount);
-
-        if (!result) {
-            strcpy (wpc->error_message, "can't write WavPack data, disk probably full!");
-            break;
-        }
-
-        wpc->filelen += bcount;
-
-        if (out2buff) {
-            bcount = ((WavpackHeader *) out2buff)->ckSize + 8;
-            native_to_little_endian ((WavpackHeader *) out2buff, WavpackHeaderFormat);
-            result = wpc->blockout (wpc->wvc_out, out2buff, bcount);
-
-            if (!result) {
-                strcpy (wpc->error_message, "can't write WavPack data, disk probably full!");
-                break;
-            }
-
-            wpc->file2len += bcount;
-        }
-
-        if (wpc->acc_samples != block_samples)
-            memmove (wps->sample_buffer, wps->sample_buffer + block_samples * (flags & MONO_FLAG ? 1 : 2),
-                (wpc->acc_samples - block_samples) * sizeof (int32_t) * (flags & MONO_FLAG ? 1 : 2));
-    }
-
-    wpc->current_stream = 0;
-    wpc->ave_block_samples = (wpc->ave_block_samples * 0x7 + block_samples + 0x4) >> 3;
-    wpc->acc_samples -= block_samples;
-    free (outbuff);
-
-    if (out2buff)
-        free (out2buff);
-
-    return result;
-}
-
-// Given the pointer to the first block written (to either a .wv or .wvc file),
-// update the block with the actual number of samples written. If the wav
-// header was generated by the library, then it is updated also. This should
-// be done if WavpackSetConfiguration() was called with an incorrect number
-// of samples (or -1). It is the responsibility of the application to read and
-// rewrite the block. An example of this can be found in the Audition filter.
-
-void WavpackUpdateNumSamples (WavpackContext *wpc, void *first_block)
-{
-    uint32_t wrapper_size;
-
-    little_endian_to_native (first_block, WavpackHeaderFormat);
-    ((WavpackHeader *) first_block)->total_samples = WavpackGetSampleIndex (wpc);
-
-    /* note that since the RIFF wrapper will not necessarily be properly aligned,
-       we copy it into a newly allocated buffer before modifying it */
-
-    if (wpc->riff_header_created) {
-        if (WavpackGetWrapperLocation (first_block, &wrapper_size)) {
-            uint32_t data_size = WavpackGetSampleIndex (wpc) * WavpackGetNumChannels (wpc) * WavpackGetBytesPerSample (wpc);
-            RiffChunkHeader *riffhdr;
-            ChunkHeader *datahdr;
-            void *wrapper_buff;
-
-            riffhdr = wrapper_buff = malloc (wrapper_size);
-            memcpy (wrapper_buff, WavpackGetWrapperLocation (first_block, NULL), wrapper_size);
-            datahdr = (ChunkHeader *)((char *) riffhdr + wrapper_size - sizeof (ChunkHeader));
-
-            if (!strncmp (riffhdr->ckID, "RIFF", 4)) {
-                little_endian_to_native (riffhdr, ChunkHeaderFormat);
-                riffhdr->ckSize = wrapper_size + data_size - 8 + wpc->riff_trailer_bytes;
-                native_to_little_endian (riffhdr, ChunkHeaderFormat);
-            }
-
-            if (!strncmp (datahdr->ckID, "data", 4)) {
-                little_endian_to_native (datahdr, ChunkHeaderFormat);
-                datahdr->ckSize = data_size;
-                native_to_little_endian (datahdr, ChunkHeaderFormat);
-            }
-
-            memcpy (WavpackGetWrapperLocation (first_block, NULL), wrapper_buff, wrapper_size);
-            free (wrapper_buff);
-        }
-    }
-
-    native_to_little_endian (first_block, WavpackHeaderFormat);
-}
-
-// Note: The following function is no longer required because the wav header
-// automatically generated for the application will also be updated by
-// WavpackUpdateNumSamples (). However, if the application wants to generate
-// its own header or wants to include additional chunks, then this function
-// still must be used to update the application generated header.
-
-// Given the pointer to the first block written to a WavPack file, this
-// function returns the location of the stored RIFF header that was originally
-// written with WavpackAddWrapper(). This would normally be used to update
-// the wav header to indicate that a different number of samples was actually
-// written or if additional RIFF chunks are written at the end of the file.
-// The "size" parameter can be set to non-NULL to obtain the exact size of the
-// RIFF header, and the function will return FALSE if the header is not found
-// in the block's metadata (or it is not a valid WavPack block). It is the
-// responsibility of the application to read and rewrite the block. An example
-// of this can be found in the Audition filter.
-
-static void *find_metadata (void *wavpack_block, int desired_id, uint32_t *size);
-
-void *WavpackGetWrapperLocation (void *first_block, uint32_t *size)
-{
-    void *loc;
-
-    little_endian_to_native (first_block, WavpackHeaderFormat);
-    loc = find_metadata (first_block, ID_RIFF_HEADER, size);
-    native_to_little_endian (first_block, WavpackHeaderFormat);
-
-    return loc;
-}
-
-static void *find_metadata (void *wavpack_block, int desired_id, uint32_t *size)
-{
-    WavpackHeader *wphdr = wavpack_block;
-    unsigned char *dp, meta_id, c1, c2;
-    int32_t bcount, meta_bc;
-
-    if (strncmp (wphdr->ckID, "wvpk", 4))
-        return NULL;
-
-    bcount = wphdr->ckSize - sizeof (WavpackHeader) + 8;
-    dp = (unsigned char *)(wphdr + 1);
-
-    while (bcount >= 2) {
-        meta_id = *dp++;
-        c1 = *dp++;
-
-        meta_bc = c1 << 1;
-        bcount -= 2;
-
-        if (meta_id & ID_LARGE) {
-            if (bcount < 2)
-                break;
-
-            c1 = *dp++;
-            c2 = *dp++;
-            meta_bc += ((uint32_t) c1 << 9) + ((uint32_t) c2 << 17);
-            bcount -= 2;
-        }
-
-        if ((meta_id & ID_UNIQUE) == desired_id) {
-            if ((bcount - meta_bc) >= 0) {
-                if (size)
-                    *size = meta_bc - ((meta_id & ID_ODD_SIZE) ? 1 : 0);
-
-                return dp;
-            }
-            else
-                return NULL;
-        }
-
-        bcount -= meta_bc;
-        dp += meta_bc;
-    }
-
-    return NULL;
-}
-
-#endif
-
-// Get total number of samples contained in the WavPack file, or -1 if unknown
-
-uint32_t WavpackGetNumSamples (WavpackContext *wpc)
-{
-    return wpc ? wpc->total_samples : (uint32_t) -1;
-}
-
-// Get the current sample index position, or -1 if unknown
-
-uint32_t WavpackGetSampleIndex (WavpackContext *wpc)
-{
-    if (wpc) {
-#if !defined(VER4_ONLY) && !defined(NO_UNPACK)
-        if (wpc->stream3)
-            return get_sample_index3 (wpc);
-        else if (wpc->streams && wpc->streams [0])
-            return wpc->streams [0]->sample_index;
-#else
-        if (wpc->streams && wpc->streams [0])
-            return wpc->streams [0]->sample_index;
-#endif
-    }
-
-    return (uint32_t) -1;
-}
-
-// Get the number of errors encountered so far
-
-int WavpackGetNumErrors (WavpackContext *wpc)
-{
-    return wpc ? wpc->crc_errors : 0;
-}
-
-// return TRUE if any uncorrected lossy blocks were actually written or read
-
-int WavpackLossyBlocks (WavpackContext *wpc)
-{
-    return wpc ? wpc->lossy_blocks : 0;
-}
-
-// Calculate the progress through the file as a double from 0.0 (for begin)
-// to 1.0 (for done). A return value of -1.0 indicates that the progress is
-// unknown.
-
-double WavpackGetProgress (WavpackContext *wpc)
-{
-    if (wpc && wpc->total_samples != (uint32_t) -1 && wpc->total_samples != 0)
-        return (double) WavpackGetSampleIndex (wpc) / wpc->total_samples;
-    else
-        return -1.0;
-}
-
-// Return the total size of the WavPack file(s) in bytes.
-
-uint32_t WavpackGetFileSize (WavpackContext *wpc)
-{
-    return wpc ? wpc->filelen + wpc->file2len : 0;
-}
-
-// Calculate the ratio of the specified WavPack file size to the size of the
-// original audio data as a double greater than 0.0 and (usually) smaller than
-// 1.0. A value greater than 1.0 represents "negative" compression and a
-// return value of 0.0 indicates that the ratio cannot be determined.
-
-double WavpackGetRatio (WavpackContext *wpc)
-{
-    if (wpc && wpc->total_samples != (uint32_t) -1 && wpc->filelen) {
-        double output_size = (double) wpc->total_samples * wpc->config.num_channels *
-            wpc->config.bytes_per_sample;
-        double input_size = (double) wpc->filelen + wpc->file2len;
-
-        if (output_size >= 1.0 && input_size >= 1.0)
-            return input_size / output_size;
-    }
-
-    return 0.0;
-}
-
-// Calculate the average bitrate of the WavPack file in bits per second. A
-// return of 0.0 indicates that the bitrate cannot be determined. An option is
-// provided to use (or not use) any attendant .wvc file.
-
-double WavpackGetAverageBitrate (WavpackContext *wpc, int count_wvc)
-{
-    if (wpc && wpc->total_samples != (uint32_t) -1 && wpc->filelen) {
-        double output_time = (double) wpc->total_samples / wpc->config.sample_rate;
-        double input_size = (double) wpc->filelen + (count_wvc ? wpc->file2len : 0);
-
-        if (output_time >= 0.1 && input_size >= 1.0)
-            return input_size * 8.0 / output_time;
-    }
-
-    return 0.0;
-}
-
-#ifndef NO_UNPACK
-
-// Calculate the bitrate of the current WavPack file block in bits per second.
-// This can be used for an "instant" bit display and gets updated from about
-// 1 to 4 times per second. A return of 0.0 indicates that the bitrate cannot
-// be determined.
-
-double WavpackGetInstantBitrate (WavpackContext *wpc)
-{
-    if (wpc && wpc->stream3)
-        return WavpackGetAverageBitrate (wpc, TRUE);
-
-    if (wpc && wpc->streams && wpc->streams [0] && wpc->streams [0]->wphdr.block_samples) {
-        double output_time = (double) wpc->streams [0]->wphdr.block_samples / wpc->config.sample_rate;
-        double input_size = 0;
-        int si;
-
-        for (si = 0; si < wpc->num_streams; ++si) {
-            if (wpc->streams [si]->blockbuff)
-                input_size += ((WavpackHeader *) wpc->streams [si]->blockbuff)->ckSize;
-
-            if (wpc->streams [si]->block2buff)
-                input_size += ((WavpackHeader *) wpc->streams [si]->block2buff)->ckSize;
-        }
-
-        if (output_time > 0.0 && input_size >= 1.0)
-            return input_size * 8.0 / output_time;
-    }
-
-    return 0.0;
-}
-
-#endif
-
-// Close the specified WavPack file and release all resources used by it.
-// Returns NULL.
-
-WavpackContext *WavpackCloseFile (WavpackContext *wpc)
-{
-    if (wpc->streams) {
-        free_streams (wpc);
-
-        if (wpc->streams [0])
-            free (wpc->streams [0]);
-
-        free (wpc->streams);
-    }
-
-#if !defined(VER4_ONLY) && !defined(NO_UNPACK)
-    if (wpc->stream3)
-        free_stream3 (wpc);
-#endif
-
-#if !defined(NO_UNPACK) || defined(INFO_ONLY)
-    if (wpc->close_files) {
-#ifndef NO_USE_FSTREAMS
-        if (wpc->wv_in != NULL)
-            fclose (wpc->wv_in);
-
-        if (wpc->wvc_in != NULL)
-            fclose (wpc->wvc_in);
-#endif
-    }
-
-    WavpackFreeWrapper (wpc);
-#endif
-
-#ifndef NO_TAGS
-    free_tag (&wpc->m_tag);
-#endif
-
-    free (wpc);
-
-    return NULL;
-}
-
-// Returns the sample rate of the specified WavPack file
-
-uint32_t WavpackGetSampleRate (WavpackContext *wpc)
-{
-    return wpc ? wpc->config.sample_rate : 44100;
-}
-
-// Returns the number of channels of the specified WavPack file. Note that
-// this is the actual number of channels contained in the file even if the
-// OPEN_2CH_MAX flag was specified when the file was opened.
-
-int WavpackGetNumChannels (WavpackContext *wpc)
-{
-    return wpc ? wpc->config.num_channels : 2;
-}
-
-// Returns the standard Microsoft channel mask for the specified WavPack
-// file. A value of zero indicates that there is no speaker assignment
-// information.
-
-int WavpackGetChannelMask (WavpackContext *wpc)
-{
-    return wpc ? wpc->config.channel_mask : 0;
-}
-
-// Return the normalization value for floating point data (valid only
-// if floating point data is present). A value of 127 indicates that
-// the floating point range is +/- 1.0. Higher values indicate a
-// larger floating point range.
-
-int WavpackGetFloatNormExp (WavpackContext *wpc)
-{
-    return wpc->config.float_norm_exp;
-}
-
-// Returns the actual number of valid bits per sample contained in the
-// original file, which may or may not be a multiple of 8. Floating data
-// always has 32 bits, integers may be from 1 to 32 bits each. When this
-// value is not a multiple of 8, then the "extra" bits are located in the
-// LSBs of the results. That is, values are right justified when unpacked
-// into ints, but are left justified in the number of bytes used by the
-// original data.
-
-int WavpackGetBitsPerSample (WavpackContext *wpc)
-{
-    return wpc ? wpc->config.bits_per_sample : 16;
-}
-
-// Returns the number of bytes used for each sample (1 to 4) in the original
-// file. This is required information for the user of this module because the
-// audio data is returned in the LOWER bytes of the long buffer and must be
-// left-shifted 8, 16, or 24 bits if normalized longs are required.
-
-int WavpackGetBytesPerSample (WavpackContext *wpc)
-{
-    return wpc ? wpc->config.bytes_per_sample : 2;
-}
-
-#if !defined(NO_UNPACK) || defined(INFO_ONLY)
-
-// If the OPEN_2CH_MAX flag is specified when opening the file, this function
-// will return the actual number of channels decoded from the file (which may
-// or may not be less than the actual number of channels, but will always be
-// 1 or 2). Normally, this will be the front left and right channels of a
-// multichannel file.
-
-int WavpackGetReducedChannels (WavpackContext *wpc)
-{
-    if (wpc)
-        return wpc->reduced_channels ? wpc->reduced_channels : wpc->config.num_channels;
-    else
-        return 2;
-}
-
-// These routines are used to access (and free) header and trailer data that
-// was retrieved from the Wavpack file. The header will be available before
-// the samples are decoded and the trailer will be available after all samples
-// have been read.
-
-uint32_t WavpackGetWrapperBytes (WavpackContext *wpc)
-{
-    return wpc ? wpc->wrapper_bytes : 0;
-}
-
-unsigned char *WavpackGetWrapperData (WavpackContext *wpc)
-{
-    return wpc ? wpc->wrapper_data : NULL;
-}
-
-void WavpackFreeWrapper (WavpackContext *wpc)
-{
-    if (wpc && wpc->wrapper_data) {
-        free (wpc->wrapper_data);
-        wpc->wrapper_data = NULL;
-        wpc->wrapper_bytes = 0;
-    }
-}
-
-// Normally the trailing wrapper will not be available when a WavPack file is first
-// opened for reading because it is stored in the final block of the file. This
-// function forces a seek to the end of the file to pick up any trailing wrapper
-// stored there (then use WavPackGetWrapper**() to obtain). This can obviously only
-// be used for seekable files (not pipes) and is not available for pre-4.0 WavPack
-// files.
-
-static void seek_riff_trailer (WavpackContext *wpc);
-
-void WavpackSeekTrailingWrapper (WavpackContext *wpc)
-{
-    if ((wpc->open_flags & OPEN_WRAPPER) &&
-        wpc->reader->can_seek (wpc->wv_in) && !wpc->stream3) {
-            uint32_t pos_save = wpc->reader->get_pos (wpc->wv_in);
-
-            seek_riff_trailer (wpc);
-            wpc->reader->set_pos_abs (wpc->wv_in, pos_save);
-    }
-}
-
-// Get any MD5 checksum stored in the metadata (should be called after reading
-// last sample or an extra seek will occur). A return value of FALSE indicates
-// that no MD5 checksum was stored.
-
-static int seek_md5 (WavpackStreamReader *reader, void *id, unsigned char data [16]);
-
-int WavpackGetMD5Sum (WavpackContext *wpc, unsigned char data [16])
-{
-    if (wpc->config.flags & CONFIG_MD5_CHECKSUM) {
-        if (wpc->config.md5_read) {
-            memcpy (data, wpc->config.md5_checksum, 16);
-            return TRUE;
-        }
-        else if (wpc->reader->can_seek (wpc->wv_in)) {
-            uint32_t pos_save = wpc->reader->get_pos (wpc->wv_in);
-
-            wpc->config.md5_read = seek_md5 (wpc->reader, wpc->wv_in, wpc->config.md5_checksum);
-            wpc->reader->set_pos_abs (wpc->wv_in, pos_save);
-
-            if (wpc->config.md5_read) {
-                memcpy (data, wpc->config.md5_checksum, 16);
-                return TRUE;
-            }
-            else
-                return FALSE;
-        }
-    }
-
-    return FALSE;
-}
-
-#endif
-
-// Free all memory allocated for raw WavPack blocks (for all allocated streams)
-// and free all additonal streams. This does not free the default stream ([0])
-// which is always kept around.
-
-static void free_streams (WavpackContext *wpc)
-{
-    int si = wpc->num_streams;
-
-    while (si--) {
-        if (wpc->streams [si]->blockbuff) {
-            free (wpc->streams [si]->blockbuff);
-            wpc->streams [si]->blockbuff = NULL;
-        }
-
-        if (wpc->streams [si]->block2buff) {
-            free (wpc->streams [si]->block2buff);
-            wpc->streams [si]->block2buff = NULL;
-        }
-
-        if (wpc->streams [si]->sample_buffer) {
-            free (wpc->streams [si]->sample_buffer);
-            wpc->streams [si]->sample_buffer = NULL;
-        }
-
-        if (wpc->streams [si]->dc.shaping_data) {
-            free (wpc->streams [si]->dc.shaping_data);
-            wpc->streams [si]->dc.shaping_data = NULL;
-        }
-
-        if (si) {
-            wpc->num_streams--;
-            free (wpc->streams [si]);
-            wpc->streams [si] = NULL;
-        }
-    }
-
-    wpc->current_stream = 0;
-}
-
-#if !defined(NO_UNPACK) || defined(INFO_ONLY)
-
-// Read from current file position until a valid 32-byte WavPack 4.0 header is
-// found and read into the specified pointer. The number of bytes skipped is
-// returned. If no WavPack header is found within 1 meg, then a -1 is returned
-// to indicate the error. No additional bytes are read past the header and it
-// is returned in the processor's native endian mode. Seeking is not required.
-
-static uint32_t read_next_header (WavpackStreamReader *reader, void *id, WavpackHeader *wphdr)
-{
-    unsigned char buffer [sizeof (*wphdr)], *sp = buffer + sizeof (*wphdr), *ep = sp;
-    uint32_t bytes_skipped = 0;
-    int bleft;
-
-    while (1) {
-        if (sp < ep) {
-            bleft = (int)(ep - sp);
-            memcpy (buffer, sp, bleft);
-        }
-        else
-            bleft = 0;
-
-        if (reader->read_bytes (id, buffer + bleft, sizeof (*wphdr) - bleft) != sizeof (*wphdr) - bleft)
-            return -1;
-
-        sp = buffer;
-
-        if (*sp++ == 'w' && *sp == 'v' && *++sp == 'p' && *++sp == 'k' &&
-            !(*++sp & 1) && sp [2] < 16 && !sp [3] && (sp [2] || sp [1] || *sp >= 24) && sp [5] == 4 &&
-            sp [4] >= (MIN_STREAM_VERS & 0xff) && sp [4] <= (MAX_STREAM_VERS & 0xff) && sp [18] < 3 && !sp [19]) {
-                memcpy (wphdr, buffer, sizeof (*wphdr));
-                little_endian_to_native (wphdr, WavpackHeaderFormat);
-                return bytes_skipped;
-            }
-
-        while (sp < ep && *sp != 'w')
-            sp++;
-
-        if ((bytes_skipped += (uint32_t)(sp - buffer)) > 1024 * 1024)
-            return -1;
-    }
-}
-
-// This function is used to seek to end of a file to determine its actual
-// length in samples by reading the last header block containing data.
-// Currently, all WavPack files contain the sample length in the first block
-// containing samples, however this might not always be the case. Obviously,
-// this function requires a seekable file or stream and leaves the file
-// pointer undefined. A return value of -1 indicates the length could not
-// be determined.
-
-static uint32_t seek_final_index (WavpackStreamReader *reader, void *id)
-{
-    uint32_t result = (uint32_t) -1, bcount;
-    WavpackHeader wphdr;
-    unsigned char *tempbuff;
-
-    if (reader->get_length (id) > 1200000L)
-        reader->set_pos_rel (id, -1048576L, SEEK_END);
-    else
-        reader->set_pos_abs (id, 0);
-
-    while (1) {
-        bcount = read_next_header (reader, id, &wphdr);
-
-        if (bcount == (uint32_t) -1)
-            return result;
-
-        tempbuff = malloc (wphdr.ckSize + 8);
-        memcpy (tempbuff, &wphdr, 32);
-
-        if (reader->read_bytes (id, tempbuff + 32, wphdr.ckSize - 24) != wphdr.ckSize - 24) {
-            free (tempbuff);
-            return result;
-        }
-
-        free (tempbuff);
-
-        if (wphdr.block_samples && (wphdr.flags & FINAL_BLOCK))
-            result = wphdr.block_index + wphdr.block_samples;
-    }
-}
-
-static int seek_md5 (WavpackStreamReader *reader, void *id, unsigned char data [16])
-{
-    unsigned char meta_id, c1, c2;
-    uint32_t bcount, meta_bc;
-    WavpackHeader wphdr;
-
-    if (reader->get_length (id) > 1200000L)
-        reader->set_pos_rel (id, -1048576L, SEEK_END);
-
-    while (1) {
-        bcount = read_next_header (reader, id, &wphdr);
-
-        if (bcount == (uint32_t) -1)
-            return FALSE;
-
-        bcount = wphdr.ckSize - sizeof (WavpackHeader) + 8;
-
-        while (bcount >= 2) {
-            if (reader->read_bytes (id, &meta_id, 1) != 1 ||
-                reader->read_bytes (id, &c1, 1) != 1)
-                    return FALSE;
-
-            meta_bc = c1 << 1;
-            bcount -= 2;
-
-            if (meta_id & ID_LARGE) {
-                if (bcount < 2 || reader->read_bytes (id, &c1, 1) != 1 ||
-                    reader->read_bytes (id, &c2, 1) != 1)
-                        return FALSE;
-
-                meta_bc += ((uint32_t) c1 << 9) + ((uint32_t) c2 << 17);
-                bcount -= 2;
-            }
-
-            if (meta_id == ID_MD5_CHECKSUM)
-                return (meta_bc == 16 && bcount >= 16 &&
-                    reader->read_bytes (id, data, 16) == 16);
-
-            reader->set_pos_rel (id, meta_bc, SEEK_CUR);
-            bcount -= meta_bc;
-        }
-    }
-}
-
-static void seek_riff_trailer (WavpackContext *wpc)
-{
-    WavpackStreamReader *reader = wpc->reader;
-    void *id = wpc->wv_in;
-    unsigned char meta_id, c1, c2;
-    uint32_t bcount, meta_bc;
-    WavpackHeader wphdr;
-
-    if (reader->get_length (id) > 1200000L)
-        reader->set_pos_rel (id, -1048576L, SEEK_END);
-
-    while (1) {
-        bcount = read_next_header (reader, id, &wphdr);
-
-        if (bcount == (uint32_t) -1)
-            return;
-
-        bcount = wphdr.ckSize - sizeof (WavpackHeader) + 8;
-
-        while (bcount >= 2) {
-            if (reader->read_bytes (id, &meta_id, 1) != 1 ||
-                reader->read_bytes (id, &c1, 1) != 1)
-                    return;
-
-            meta_bc = c1 << 1;
-            bcount -= 2;
-
-            if (meta_id & ID_LARGE) {
-                if (bcount < 2 || reader->read_bytes (id, &c1, 1) != 1 ||
-                    reader->read_bytes (id, &c2, 1) != 1)
-                        return;
-
-                meta_bc += ((uint32_t) c1 << 9) + ((uint32_t) c2 << 17);
-                bcount -= 2;
-            }
-
-            if ((meta_id & ID_UNIQUE) == ID_RIFF_TRAILER) {
-                wpc->wrapper_data = realloc (wpc->wrapper_data, wpc->wrapper_bytes + meta_bc);
-
-                if (reader->read_bytes (id, wpc->wrapper_data + wpc->wrapper_bytes, meta_bc) == meta_bc)
-                    wpc->wrapper_bytes += meta_bc;
-                else
-                    return;
-            }
-            else
-                reader->set_pos_rel (id, meta_bc, SEEK_CUR);
-
-            bcount -= meta_bc;
-        }
-    }
-}
-
-// Compare the regular wv file block header to a potential matching wvc
-// file block header and return action code based on analysis:
-//
-//   0 = use wvc block (assuming rest of block is readable)
-//   1 = bad match; try to read next wvc block
-//  -1 = bad match; ignore wvc file for this block and backup fp (if
-//       possible) and try to use this block next time
-
-static int match_wvc_header (WavpackHeader *wv_hdr, WavpackHeader *wvc_hdr)
-{
-    if (wv_hdr->block_index == wvc_hdr->block_index &&
-        wv_hdr->block_samples == wvc_hdr->block_samples) {
-            int wvi = 0, wvci = 0;
-
-            if (wv_hdr->flags == wvc_hdr->flags)
-                return 0;
-
-            if (wv_hdr->flags & INITIAL_BLOCK)
-                wvi -= 1;
-
-            if (wv_hdr->flags & FINAL_BLOCK)
-                wvi += 1;
-
-            if (wvc_hdr->flags & INITIAL_BLOCK)
-                wvci -= 1;
-
-            if (wvc_hdr->flags & FINAL_BLOCK)
-                wvci += 1;
-
-            return (wvci - wvi < 0) ? 1 : -1;
-        }
-
-    if ((int32_t)(wvc_hdr->block_index - wv_hdr->block_index) < 0)
-        return 1;
-    else
-        return -1;
-}
-
-// Read the wvc block that matches the regular wv block that has been
-// read for the current stream. If an exact match is not found then
-// we either keep reading or back up and (possibly) use the block
-// later. The skip_wvc flag is set if not matching wvc block is found
-// so that we can still decode using only the lossy version (although
-// we flag this as an error). A return of FALSE indicates a serious
-// error (not just that we missed one wvc block).
-
-static int read_wvc_block (WavpackContext *wpc)
-{
-    WavpackStream *wps = wpc->streams [wpc->current_stream];
-    uint32_t bcount, file2pos;
-    WavpackHeader wphdr;
-    int compare_result;
-
-    while (1) {
-        file2pos = wpc->reader->get_pos (wpc->wvc_in);
-        bcount = read_next_header (wpc->reader, wpc->wvc_in, &wphdr);
-
-        if (bcount == (uint32_t) -1) {
-            wps->wvc_skip = TRUE;
-            wpc->crc_errors++;
-            return FALSE;
-        }
-
-        if (wpc->open_flags & OPEN_STREAMING)
-            wphdr.block_index = wps->sample_index = 0;
-        else
-            wphdr.block_index -= wpc->initial_index;
-
-        if (wphdr.flags & INITIAL_BLOCK)
-            wpc->file2pos = file2pos + bcount;
-
-        compare_result = match_wvc_header (&wps->wphdr, &wphdr);
-
-        if (!compare_result) {
-            wps->block2buff = malloc (wphdr.ckSize + 8);
-            memcpy (wps->block2buff, &wphdr, 32);
-
-            if (wpc->reader->read_bytes (wpc->wvc_in, wps->block2buff + 32, wphdr.ckSize - 24) !=
-                wphdr.ckSize - 24 || (wphdr.flags & UNKNOWN_FLAGS)) {
-                    free (wps->block2buff);
-                    wps->block2buff = NULL;
-                    wps->wvc_skip = TRUE;
-                    wpc->crc_errors++;
-                    return FALSE;
-            }
-
-            wps->wvc_skip = FALSE;
-            memcpy (&wps->wphdr, &wphdr, 32);
-            return TRUE;
-        }
-        else if (compare_result == -1) {
-            wps->wvc_skip = TRUE;
-            wpc->reader->set_pos_rel (wpc->wvc_in, -32, SEEK_CUR);
-            wpc->crc_errors++;
-            return TRUE;
-        }
-    }
-}
-
-#ifndef NO_SEEKING
-
-// Find a valid WavPack header, searching either from the current file position
-// (or from the specified position if not -1) and store it (endian corrected)
-// at the specified pointer. The return value is the exact file position of the
-// header, although we may have actually read past it. Because this function
-// is used for seeking to a specific audio sample, it only considers blocks
-// that contain audio samples for the initial stream to be valid.
-
-#define BUFSIZE 4096
-
-static uint32_t find_header (WavpackStreamReader *reader, void *id, uint32_t filepos, WavpackHeader *wphdr)
-{
-    unsigned char *buffer = malloc (BUFSIZE), *sp = buffer, *ep = buffer;
-
-    if (filepos != (uint32_t) -1 && reader->set_pos_abs (id, filepos)) {
-        free (buffer);
-        return -1;
-    }
-
-    while (1) {
-        int bleft;
-
-        if (sp < ep) {
-            bleft = (int)(ep - sp);
-            memcpy (buffer, sp, bleft);
-            ep -= (sp - buffer);
-            sp = buffer;
-        }
-        else {
-            if (sp > ep)
-                if (reader->set_pos_rel (id, (int32_t)(sp - ep), SEEK_CUR)) {
-                    free (buffer);
-                    return -1;
-                }
-
-            sp = ep = buffer;
-            bleft = 0;
-        }
-
-        ep += reader->read_bytes (id, ep, BUFSIZE - bleft);
-
-        if (ep - sp < 32) {
-            free (buffer);
-            return -1;
-        }
-
-        while (sp + 32 <= ep)
-            if (*sp++ == 'w' && *sp == 'v' && *++sp == 'p' && *++sp == 'k' &&
-                !(*++sp & 1) && sp [2] < 16 && !sp [3] && (sp [2] || sp [1] || *sp >= 24) && sp [5] == 4 &&
-                sp [4] >= (MIN_STREAM_VERS & 0xff) && sp [4] <= (MAX_STREAM_VERS & 0xff) && sp [18] < 3 && !sp [19]) {
-                    memcpy (wphdr, sp - 4, sizeof (*wphdr));
-                    little_endian_to_native (wphdr, WavpackHeaderFormat);
-
-                    if (wphdr->block_samples && (wphdr->flags & INITIAL_BLOCK)) {
-                        free (buffer);
-                        return (uint32_t) (reader->get_pos (id) - (ep - sp + 4));
-                    }
-
-                    if (wphdr->ckSize > 1024)
-                        sp += wphdr->ckSize - 1024;
-            }
-    }
-}
-
-// Find the WavPack block that contains the specified sample. If "header_pos"
-// is zero, then no information is assumed except the total number of samples
-// in the file and its size in bytes. If "header_pos" is non-zero then we
-// assume that it is the file position of the valid header image contained in
-// the first stream and we can limit our search to either the portion above
-// or below that point. If a .wvc file is being used, then this must be called
-// for that file also.
-
-static uint32_t find_sample (WavpackContext *wpc, void *infile, uint32_t header_pos, uint32_t sample)
-{
-    WavpackStream *wps = wpc->streams [wpc->current_stream];
-    uint32_t file_pos1 = 0, file_pos2 = wpc->reader->get_length (infile);
-    uint32_t sample_pos1 = 0, sample_pos2 = wpc->total_samples;
-    double ratio = 0.96;
-    int file_skip = 0;
-
-    if (sample >= wpc->total_samples)
-        return -1;
-
-    if (header_pos && wps->wphdr.block_samples) {
-        if (wps->wphdr.block_index > sample) {
-            sample_pos2 = wps->wphdr.block_index;
-            file_pos2 = header_pos;
-        }
-        else if (wps->wphdr.block_index + wps->wphdr.block_samples <= sample) {
-            sample_pos1 = wps->wphdr.block_index;
-            file_pos1 = header_pos;
-        }
-        else
-            return header_pos;
-    }
-
-    while (1) {
-        double bytes_per_sample;
-        uint32_t seek_pos;
-
-        bytes_per_sample = file_pos2 - file_pos1;
-        bytes_per_sample /= sample_pos2 - sample_pos1;
-        seek_pos = file_pos1 + (file_skip ? 32 : 0);
-        seek_pos += (uint32_t)(bytes_per_sample * (sample - sample_pos1) * ratio);
-        seek_pos = find_header (wpc->reader, infile, seek_pos, &wps->wphdr);
-
-        if (seek_pos != (uint32_t) -1)
-            wps->wphdr.block_index -= wpc->initial_index;
-
-        if (seek_pos == (uint32_t) -1 || seek_pos >= file_pos2) {
-            if (ratio > 0.0) {
-                if ((ratio -= 0.24) < 0.0)
-                    ratio = 0.0;
-            }
-            else
-                return -1;
-        }
-        else if (wps->wphdr.block_index > sample) {
-            sample_pos2 = wps->wphdr.block_index;
-            file_pos2 = seek_pos;
-        }
-        else if (wps->wphdr.block_index + wps->wphdr.block_samples <= sample) {
-
-            if (seek_pos == file_pos1)
-                file_skip = 1;
-            else {
-                sample_pos1 = wps->wphdr.block_index;
-                file_pos1 = seek_pos;
-            }
-        }
-        else
-            return seek_pos;
-    }
-}
-
-#endif
-
-#endif
-
-void WavpackLittleEndianToNative (void *data, char *format)
-{
-    little_endian_to_native (data, format);
-}
-
-void WavpackNativeToLittleEndian (void *data, char *format)
-{
-    native_to_little_endian (data, format);
-}
-
-uint32_t WavpackGetLibraryVersion (void)
-{
-    return (LIBWAVPACK_MAJOR<<16)
-          |(LIBWAVPACK_MINOR<<8)
-          |(LIBWAVPACK_MICRO<<0);
-}
-
-const char *WavpackGetLibraryVersionString (void)
-{
-    return LIBWAVPACK_VERSION_STRING;
-}
-
diff --git a/third_party/wavpack/src/write_words.c b/third_party/wavpack/src/write_words.c
new file mode 100644
index 0000000..6e6c6b6
--- /dev/null
+++ b/third_party/wavpack/src/write_words.c
@@ -0,0 +1,688 @@
+////////////////////////////////////////////////////////////////////////////
+//                           **** WAVPACK ****                            //
+//                  Hybrid Lossless Wavefile Compressor                   //
+//              Copyright (c) 1998 - 2013 Conifer Software.               //
+//                          All Rights Reserved.                          //
+//      Distributed under the BSD Software License (see license.txt)      //
+////////////////////////////////////////////////////////////////////////////
+
+// write_words.c
+
+// This module provides entropy word encoding functions using
+// a variation on the Rice method.  This was introduced in version 3.93
+// because it allows splitting the data into a "lossy" stream and a
+// "correction" stream in a very efficient manner and is therefore ideal
+// for the "hybrid" mode.  For 4.0, the efficiency of this method was
+// significantly improved by moving away from the normal Rice restriction of
+// using powers of two for the modulus divisions and now the method can be
+// used for both hybrid and pure lossless encoding.
+
+// Samples are divided by median probabilities at 5/7 (71.43%), 10/49 (20.41%),
+// and 20/343 (5.83%). Each zone has 3.5 times fewer samples than the
+// previous. Using standard Rice coding on this data would result in 1.4
+// bits per sample average (not counting sign bit). However, there is a
+// very simple encoding that is over 99% efficient with this data and
+// results in about 1.22 bits per sample.
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "wavpack_local.h"
+
+///////////////////////////// executable code ////////////////////////////////
+
+// Initialize entropy encoder for the specified stream. In lossless mode there
+// are no parameters to select; in hybrid mode the bitrate mode and value need
+// be initialized.
+
+static void word_set_bitrate (WavpackStream *wps);
+
+void init_words (WavpackStream *wps)
+{
+    CLEAR (wps->w);     // reset the stream's word-coder state (CLEAR is defined outside this file)
+
+    if (wps->wphdr.flags & HYBRID_FLAG)     // bitrate accumulators are only set up for hybrid blocks
+        word_set_bitrate (wps);
+}
+
+// Set up parameters for hybrid mode based on header flags and "bits" field.
+// This is currently only set up for the HYBRID_BITRATE mode in which the
+// allowed error varies with the residual level (from "slow_level"). The
+// simpler mode (which is not used yet) has the error level directly
+// controlled from the metadata.
+
+static void word_set_bitrate (WavpackStream *wps)
+{
+    int bitrate_0, bitrate_1;
+
+    if (wps->wphdr.flags & HYBRID_BITRATE) {
+        if (wps->wphdr.flags & FALSE_STEREO)    // false stereo: (bits * 2 - 512) takes the place of bits
+            bitrate_0 = (wps->bits * 2 - 512) < 568 ? 0 : (wps->bits * 2 - 512) - 568;
+        else
+            bitrate_0 = wps->bits < 568 ? 0 : wps->bits - 568;     // subtract 568, clamping at zero
+
+        if (!(wps->wphdr.flags & MONO_DATA)) {
+
+            if (wps->wphdr.flags & HYBRID_BALANCE)
+                bitrate_1 = (wps->wphdr.flags & JOINT_STEREO) ? 256 : 0;
+            else {
+                bitrate_1 = bitrate_0;      // start with both channels equal
+
+                if (wps->wphdr.flags & JOINT_STEREO) {     // shift up to 128 from channel 0 to channel 1
+                    if (bitrate_0 < 128) {
+                        bitrate_1 += bitrate_0;     // channel 0 has less than 128: move all of it
+                        bitrate_0 = 0;
+                    }
+                    else {
+                        bitrate_0 -= 128;
+                        bitrate_1 += 128;
+                    }
+                }
+            }
+        }
+        else
+            bitrate_1 = 0;      // mono data: second channel value is zero
+    }
+    else
+        bitrate_0 = bitrate_1 = 0;      // not HYBRID_BITRATE: both accumulators start at zero
+
+    wps->w.bitrate_acc [0] = (int32_t) bitrate_0 << 16;    // values are kept in the upper 16 bits
+    wps->w.bitrate_acc [1] = (int32_t) bitrate_1 << 16;
+}
+
+// Allocates the correct space in the metadata structure and writes the
+// current median values to it. Values are converted from 32-bit unsigned
+// to our internal 16-bit wp_log2 values, and read_entropy_vars () is called
+// to read the values back because we must compensate for the loss through
+// the log function.
+
+void write_entropy_vars (WavpackStream *wps, WavpackMetadata *wpmd)
+{
+    unsigned char *byteptr;
+    int temp;
+
+    byteptr = wpmd->data = malloc (12);     // room for 6 values x 2 bytes; NOTE(review): result is not checked for NULL
+    wpmd->id = ID_ENTROPY_VARS;
+
+    *byteptr++ = temp = wp_log2 (wps->w.c [0].median [0]);     // low byte first (store truncates to 8 bits)
+    *byteptr++ = temp >> 8;                                     // then the high byte
+    *byteptr++ = temp = wp_log2 (wps->w.c [0].median [1]);
+    *byteptr++ = temp >> 8;
+    *byteptr++ = temp = wp_log2 (wps->w.c [0].median [2]);
+    *byteptr++ = temp >> 8;
+
+    if (!(wps->wphdr.flags & MONO_DATA)) {      // second channel's three medians only when not mono
+        *byteptr++ = temp = wp_log2 (wps->w.c [1].median [0]);
+        *byteptr++ = temp >> 8;
+        *byteptr++ = temp = wp_log2 (wps->w.c [1].median [1]);
+        *byteptr++ = temp >> 8;
+        *byteptr++ = temp = wp_log2 (wps->w.c [1].median [2]);
+        *byteptr++ = temp >> 8;
+    }
+
+    wpmd->byte_length = (int32_t)(byteptr - (unsigned char *) wpmd->data);     // 6 bytes (mono) or 12 bytes written
+    read_entropy_vars (wps, wpmd);      // read the just-written values back into wps
+}
+
+// Allocates enough space in the metadata structure and writes the current
+// high word of the bitrate accumulator and the slow_level values to it. The
+// slow_level values are converted from 32-bit unsigned to our internal 16-bit
+// wp_log2 values. Afterward, read_hybrid_profile () is called to read the values
+// back because we must compensate for the loss through the log function and
+// the truncation of the bitrate.
+
+void write_hybrid_profile (WavpackStream *wps, WavpackMetadata *wpmd)
+{
+    unsigned char *byteptr;
+    int temp;
+
+    word_set_bitrate (wps);     // recompute bitrate_acc [] from the header flags and wps->bits
+    byteptr = wpmd->data = malloc (512);    // at most 12 bytes are written below; NOTE(review): result is not checked for NULL
+    wpmd->id = ID_HYBRID_PROFILE;
+
+    if (wps->wphdr.flags & HYBRID_BITRATE) {    // slow_level values are only stored in HYBRID_BITRATE mode
+        *byteptr++ = temp = wp_log2s (wps->w.c [0].slow_level);    // low byte, then high byte
+        *byteptr++ = temp >> 8;
+
+        if (!(wps->wphdr.flags & MONO_DATA)) {
+            *byteptr++ = temp = wp_log2s (wps->w.c [1].slow_level);
+            *byteptr++ = temp >> 8;
+        }
+    }
+
+    *byteptr++ = temp = wps->w.bitrate_acc [0] >> 16;       // only the high word of the accumulator is stored
+    *byteptr++ = temp >> 8;
+
+    if (!(wps->wphdr.flags & MONO_DATA)) {
+        *byteptr++ = temp = wps->w.bitrate_acc [1] >> 16;
+        *byteptr++ = temp >> 8;
+    }
+
+    if (wps->w.bitrate_delta [0] | wps->w.bitrate_delta [1]) {     // deltas are written only if either one is non-zero
+        *byteptr++ = temp = wp_log2s (wps->w.bitrate_delta [0]);
+        *byteptr++ = temp >> 8;
+
+        if (!(wps->wphdr.flags & MONO_DATA)) {
+            *byteptr++ = temp = wp_log2s (wps->w.bitrate_delta [1]);
+            *byteptr++ = temp >> 8;
+        }
+    }
+
+    wpmd->byte_length = (int32_t)(byteptr - (unsigned char *) wpmd->data);     // number of bytes actually written
+    read_hybrid_profile (wps, wpmd);    // read the just-written values back into wps
+}
+
+// This function writes the specified word to the open bitstream "wvbits" and,
+// if the bitstream "wvcbits" is open, writes any correction data there. This
+// function will work for either lossless or hybrid but because a version
+// optimized for lossless exists below, it would normally be used for the hybrid
+// mode only. The return value is the actual value stored to the stream (even
+// if a correction file is being created) and is used as feedback to the
+// predictor.
+
+int32_t FASTCALL send_word (WavpackStream *wps, int32_t value, int chan)
+{
+    struct entropy_data *c = wps->w.c + chan;       // state of the selected channel
+    uint32_t ones_count, low, mid, high;
+    int sign = (value < 0) ? 1 : 0;
+
+    if (wps->w.c [0].median [0] < 2 && !wps->w.holding_zero && wps->w.c [1].median [0] < 2) {
+        if (wps->w.zeros_acc) {         // already inside a run of zero samples
+            if (value)
+                flush_word (wps);       // non-zero sample ends the run: write it out
+            else {
+                c->slow_level -= (c->slow_level + SLO) >> SLS;
+                wps->w.zeros_acc++;     // extend the run; nothing is written yet
+                return 0;
+            }
+        }
+        else if (value)
+            putbit_0 (&wps->wvbits);    // no run in progress and sample is non-zero: emit a single 0 bit
+        else {
+            c->slow_level -= (c->slow_level + SLO) >> SLS;
+            CLEAR (wps->w.c [0].median);    // starting a run clears both channels' medians
+            CLEAR (wps->w.c [1].median);
+            wps->w.zeros_acc = 1;
+            return 0;
+        }
+    }
+
+    if (sign)
+        value = ~value;     // negative values become non-negative (-1 -> 0); sign is sent separately
+
+    if ((wps->wphdr.flags & HYBRID_FLAG) && !chan)     // error limit is refreshed on channel 0 only
+        update_error_limit (wps);
+
+    if (value < (int32_t) GET_MED (0)) {    // pick the interval [low, high] that contains value
+        ones_count = low = 0;
+        high = GET_MED (0) - 1;
+        DEC_MED0 ();
+    }
+    else {
+        low = GET_MED (0);
+        INC_MED0 ();
+
+        if (value - low < GET_MED (1)) {
+            ones_count = 1;
+            high = low + GET_MED (1) - 1;
+            DEC_MED1 ();
+        }
+        else {
+            low += GET_MED (1);
+            INC_MED1 ();
+
+            if (value - low < GET_MED (2)) {
+                ones_count = 2;
+                high = low + GET_MED (2) - 1;
+                DEC_MED2 ();
+            }
+            else {
+                ones_count = 2 + (value - low) / GET_MED (2);   // beyond the third zone: steps of GET_MED (2)
+                low += (ones_count - 2) * GET_MED (2);
+                high = low + GET_MED (2) - 1;
+                INC_MED2 ();
+            }
+        }
+    }
+
+    mid = (high + low + 1) >> 1;    // midpoint of the interval, rounded up
+
+    if (wps->w.holding_zero) {      // a zero is still held back from the previous word
+        if (ones_count)
+            wps->w.holding_one++;
+
+        flush_word (wps);           // writes the held ones / zero / pending data
+
+        if (ones_count) {
+            wps->w.holding_zero = 1;
+            ones_count--;
+        }
+        else
+            wps->w.holding_zero = 0;
+    }
+    else
+        wps->w.holding_zero = 1;
+
+    wps->w.holding_one = ones_count * 2;
+
+    if (!c->error_limit) {          // no error allowed: encode the exact offset inside [low, high]
+        if (high != low) {
+            uint32_t maxcode = high - low, code = value - low;
+            int bitcount = count_bits (maxcode);
+            uint32_t extras = bitset [bitcount] - maxcode - 1;
+
+            if (code < extras) {    // codes below extras take bitcount - 1 bits
+                wps->w.pend_data |= code << wps->w.pend_count;
+                wps->w.pend_count += bitcount - 1;
+            }
+            else {                  // the rest take bitcount bits: bitcount - 1 bits, then one more
+                wps->w.pend_data |= ((code + extras) >> 1) << wps->w.pend_count;
+                wps->w.pend_count += bitcount - 1;
+                wps->w.pend_data |= ((code + extras) & 1) << wps->w.pend_count++;
+            }
+        }
+
+        mid = value;                // exact value is what gets returned
+    }
+    else
+        while (high - low > c->error_limit)     // halve [low, high] until it is no wider than error_limit
+            if (value < (int32_t) mid) {
+                mid = ((high = mid - 1) + low + 1) >> 1;
+                wps->w.pend_count++;            // lower half: only the pending bit count advances
+            }
+            else {
+                mid = (high + (low = mid) + 1) >> 1;
+                wps->w.pend_data |= bitset [wps->w.pend_count++];   // upper half: OR in bitset [] entry for this position
+            }
+
+    wps->w.pend_data |= ((int32_t) sign << wps->w.pend_count++);   // sign goes after the magnitude bits
+
+    if (!wps->w.holding_zero)
+        flush_word (wps);
+
+    if (bs_is_open (&wps->wvcbits) && c->error_limit) {    // correction stream: position of value inside the final [low, high]
+        uint32_t code = value - low, maxcode = high - low;
+        int bitcount = count_bits (maxcode);
+        uint32_t extras = bitset [bitcount] - maxcode - 1;
+
+        if (bitcount) {
+            if (code < extras)
+                putbits (code, bitcount - 1, &wps->wvcbits);
+            else {
+                putbits ((code + extras) >> 1, bitcount - 1, &wps->wvcbits);
+                putbit ((code + extras) & 1, &wps->wvcbits);
+            }
+        }
+    }
+
+    if (wps->wphdr.flags & HYBRID_BITRATE) {
+        c->slow_level -= (c->slow_level + SLO) >> SLS;
+        c->slow_level += wp_log2 (mid);
+    }
+
+    return sign ? ~mid : mid;       // undo the sign mapping applied above
+}
+
+// This function is an optimized version of send_word() that only handles
+// lossless (error_limit == 0) and sends an entire buffer of either mono or
+// stereo data rather than a single sample. Unlike the generalized
+// send_word(), it does not return values because it always encodes
+// the exact value passed.
+
+void send_words_lossless (WavpackStream *wps, int32_t *buffer, int32_t nsamples)
+{
+    struct entropy_data *c = wps->w.c;      // channel 0 state; re-pointed per sample when not mono
+    int32_t value, csamples;
+
+    if (!(wps->wphdr.flags & MONO_DATA))
+        nsamples *= 2;      // not mono: buffer holds two values per sample
+
+    for (csamples = 0; csamples < nsamples; ++csamples) {
+        int sign = ((value = *buffer++) < 0) ? 1 : 0;
+        uint32_t ones_count, low, high;
+
+        if (!(wps->wphdr.flags & MONO_DATA))
+            c = wps->w.c + (csamples & 1);      // even entries use channel 0, odd entries channel 1
+
+        if (wps->w.c [0].median [0] < 2 && !wps->w.holding_zero && wps->w.c [1].median [0] < 2) {
+            if (wps->w.zeros_acc) {         // already inside a run of zero samples
+                if (value)
+                    flush_word (wps);       // non-zero sample ends the run: write it out
+                else {
+                    wps->w.zeros_acc++;     // extend the run; nothing is written yet
+                    continue;
+                }
+            }
+            else if (value)
+                putbit_0 (&wps->wvbits);    // no run in progress and sample is non-zero: emit a single 0 bit
+            else {
+                CLEAR (wps->w.c [0].median);    // starting a run clears both channels' medians
+                CLEAR (wps->w.c [1].median);
+                wps->w.zeros_acc = 1;
+                continue;
+            }
+        }
+
+        if (sign)
+            value = ~value;     // negative values become non-negative (-1 -> 0); sign is sent separately
+
+        if (value < (int32_t) GET_MED (0)) {    // pick the interval [low, high] that contains value
+            ones_count = low = 0;
+            high = GET_MED (0) - 1;
+            DEC_MED0 ();
+        }
+        else {
+            low = GET_MED (0);
+            INC_MED0 ();
+
+            if (value - low < GET_MED (1)) {
+                ones_count = 1;
+                high = low + GET_MED (1) - 1;
+                DEC_MED1 ();
+            }
+            else {
+                low += GET_MED (1);
+                INC_MED1 ();
+
+                if (value - low < GET_MED (2)) {
+                    ones_count = 2;
+                    high = low + GET_MED (2) - 1;
+                    DEC_MED2 ();
+                }
+                else {
+                    ones_count = 2 + (value - low) / GET_MED (2);   // beyond the third zone: steps of GET_MED (2)
+                    low += (ones_count - 2) * GET_MED (2);
+                    high = low + GET_MED (2) - 1;
+                    INC_MED2 ();
+                }
+            }
+        }
+
+        if (wps->w.holding_zero) {      // a zero is still held back from the previous word
+            if (ones_count)
+                wps->w.holding_one++;
+
+            flush_word (wps);           // writes the held ones / zero / pending data
+
+            if (ones_count) {
+                wps->w.holding_zero = 1;
+                ones_count--;
+            }
+            else
+                wps->w.holding_zero = 0;
+        }
+        else
+            wps->w.holding_zero = 1;
+
+        wps->w.holding_one = ones_count * 2;
+
+        if (high != low) {              // encode the exact offset inside [low, high]
+            uint32_t maxcode = high - low, code = value - low;
+            int bitcount = count_bits (maxcode);
+            uint32_t extras = bitset [bitcount] - maxcode - 1;
+
+            if (code < extras) {        // codes below extras take bitcount - 1 bits
+                wps->w.pend_data |= code << wps->w.pend_count;
+                wps->w.pend_count += bitcount - 1;
+            }
+            else {                      // the rest take bitcount bits: bitcount - 1 bits, then one more
+                wps->w.pend_data |= ((code + extras) >> 1) << wps->w.pend_count;
+                wps->w.pend_count += bitcount - 1;
+                wps->w.pend_data |= ((code + extras) & 1) << wps->w.pend_count++;
+            }
+        }
+
+        wps->w.pend_data |= ((int32_t) sign << wps->w.pend_count++);   // sign goes after the magnitude bits
+
+        if (!wps->w.holding_zero)
+            flush_word (wps);
+    }
+}
+
+// Used by send_word() and send_words_lossless() to actually send most of the
+// accumulated data onto the bitstream. This is also called directly from
+// clients when all words have been sent.
+
+void flush_word (WavpackStream *wps)
+{
+    if (wps->w.zeros_acc) {     // a run of zero samples is pending: write its length
+        int cbits = count_bits (wps->w.zeros_acc);
+
+        while (cbits--)         // one 1 bit per bit of the run length ...
+            putbit_1 (&wps->wvbits);
+
+        putbit_0 (&wps->wvbits);    // ... closed by a 0 bit
+
+        while (wps->w.zeros_acc > 1) {      // then the length's bits, lowest first, stopping before its top bit
+            putbit (wps->w.zeros_acc & 1, &wps->wvbits);
+            wps->w.zeros_acc >>= 1;
+        }
+
+        wps->w.zeros_acc = 0;
+    }
+
+    if (wps->w.holding_one) {
+#ifdef LIMIT_ONES
+        if (wps->w.holding_one >= LIMIT_ONES) {     // long run of ones: escape, then write the remainder as a count
+            int cbits;
+
+            putbits ((1L << LIMIT_ONES) - 1, LIMIT_ONES + 1, &wps->wvbits);    // LIMIT_ONES one bits and a zero bit in one call
+            wps->w.holding_one -= LIMIT_ONES;
+            cbits = count_bits (wps->w.holding_one);
+
+            while (cbits--)
+                putbit_1 (&wps->wvbits);
+
+            putbit_0 (&wps->wvbits);
+
+            while (wps->w.holding_one > 1) {    // same scheme as the zero-run length above
+                putbit (wps->w.holding_one & 1, &wps->wvbits);
+                wps->w.holding_one >>= 1;
+            }
+
+            wps->w.holding_zero = 0;    // the held zero is dropped on this path
+        }
+        else
+            putbits (bitmask [wps->w.holding_one], wps->w.holding_one, &wps->wvbits);
+
+        wps->w.holding_one = 0;
+#else
+        do {                    // write holding_one 1 bits, counting it down to zero
+            putbit_1 (&wps->wvbits);
+        } while (--wps->w.holding_one);
+#endif
+    }
+
+    if (wps->w.holding_zero) {      // write the held 0 bit
+        putbit_0 (&wps->wvbits);
+        wps->w.holding_zero = 0;
+    }
+
+    if (wps->w.pend_count) {        // write the accumulated data bits and reset the accumulator
+        putbits (wps->w.pend_data, wps->w.pend_count, &wps->wvbits);
+        wps->w.pend_data = wps->w.pend_count = 0;
+    }
+}
+
+// This function is similar to send_word() except that no data is actually
+// written to any stream, but it does return the value that would have been
+// sent to a hybrid stream. It is used to determine beforehand how much noise
+// will be added to samples.
+
+int32_t nosend_word (WavpackStream *wps, int32_t value, int chan)
+{
+    struct entropy_data *c = wps->w.c + chan;       // state of the selected channel
+    uint32_t ones_count, low, mid, high;            // ones_count is only set and used in the last zone below
+    int sign = (value < 0) ? 1 : 0;
+
+    if (sign)
+        value = ~value;     // negative values become non-negative (-1 -> 0)
+
+    if ((wps->wphdr.flags & HYBRID_FLAG) && !chan)     // error limit is refreshed on channel 0 only
+        update_error_limit (wps);
+
+    if (value < (int32_t) GET_MED (0)) {    // pick the interval [low, high] that contains value
+        low = 0;
+        high = GET_MED (0) - 1;
+        DEC_MED0 ();
+    }
+    else {
+        low = GET_MED (0);
+        INC_MED0 ();
+
+        if (value - low < GET_MED (1)) {
+            high = low + GET_MED (1) - 1;
+            DEC_MED1 ();
+        }
+        else {
+            low += GET_MED (1);
+            INC_MED1 ();
+
+            if (value - low < GET_MED (2)) {
+                high = low + GET_MED (2) - 1;
+                DEC_MED2 ();
+            }
+            else {
+                ones_count = 2 + (value - low) / GET_MED (2);   // beyond the third zone: steps of GET_MED (2)
+                low += (ones_count - 2) * GET_MED (2);
+                high = low + GET_MED (2) - 1;
+                INC_MED2 ();
+            }
+        }
+    }
+
+    mid = (high + low + 1) >> 1;    // midpoint of the interval, rounded up
+
+    if (!c->error_limit)
+        mid = value;                // no error allowed: the exact value is returned
+    else
+        while (high - low > c->error_limit)     // halve [low, high] until it is no wider than error_limit
+            if (value < (int32_t) mid)
+                mid = ((high = mid - 1) + low + 1) >> 1;
+            else
+                mid = (high + (low = mid) + 1) >> 1;
+
+    c->slow_level -= (c->slow_level + SLO) >> SLS;     // unlike send_word(), not conditional on HYBRID_BITRATE
+    c->slow_level += wp_log2 (mid);
+
+    return sign ? ~mid : mid;       // undo the sign mapping applied above
+}
+
+// This function is used to scan some number of samples to set the variables
+// "slow_level" and the "median" array. In pure symmetrical encoding mode this
+// would not be needed because these values would simply be continued from the
+// previous block. However, in the -X modes and the 32-bit modes we cannot do
+// this because parameters may change between blocks and the variables might
+// not apply. This function can work in mono or stereo and can scan a block
+// in either direction.
+
+static void scan_word_pass (WavpackStream *wps, int32_t *samples, uint32_t num_samples, int dir)
+{
+    uint32_t flags = wps->wphdr.flags, value, low;
+    struct entropy_data *c = wps->w.c;      // starts at channel 0; stepped to channel 1 and back below
+    int chan;
+
+    if (flags & MONO_DATA) {
+        if (dir < 0) {
+            samples += (num_samples - 1);       // backward scan starts at the last sample
+            dir = -1;
+        }
+        else
+            dir = 1;                            // from here on dir is the pointer step per sample
+    }
+    else {
+        if (dir < 0) {
+            samples += (num_samples - 1) * 2;   // two values per sample when not mono
+            dir = -2;
+        }
+        else
+            dir = 2;
+    }
+
+    while (num_samples--) {
+
+        value = labs (samples [chan = 0]);      // magnitude of the first value of this sample
+
+        if (flags & HYBRID_BITRATE) {
+            wps->w.c [0].slow_level -= (wps->w.c [0].slow_level + SLO) >> SLS;
+            wps->w.c [0].slow_level += wp_log2 (value);
+        }
+
+        if (value < GET_MED (0)) {      // adjust the medians only; nothing is written to any stream
+            DEC_MED0 ();
+        }
+        else {
+            low = GET_MED (0);
+            INC_MED0 ();
+
+            if (value - low < GET_MED (1)) {
+                DEC_MED1 ();
+            }
+            else {
+                low += GET_MED (1);
+                INC_MED1 ();
+
+                if (value - low < GET_MED (2)) {
+                    DEC_MED2 ();
+                }
+                else {
+                    INC_MED2 ();
+                }
+            }
+        }
+
+        if (!(flags & MONO_DATA)) {
+            value = labs (samples [chan = 1]);  // magnitude of the second value of this sample
+            c++;                                // point c at channel 1 for the median macros (presumably they go through c)
+
+            if (wps->wphdr.flags & HYBRID_BITRATE) {
+                wps->w.c [1].slow_level -= (wps->w.c [1].slow_level + SLO) >> SLS;
+                wps->w.c [1].slow_level += wp_log2 (value);
+            }
+
+            if (value < GET_MED (0)) {
+                DEC_MED0 ();
+            }
+            else {
+                low = GET_MED (0);
+                INC_MED0 ();
+
+                if (value - low < GET_MED (1)) {
+                    DEC_MED1 ();
+                }
+                else {
+                    low += GET_MED (1);
+                    INC_MED1 ();
+
+                    if (value - low < GET_MED (2)) {
+                        DEC_MED2 ();
+                    }
+                    else {
+                        INC_MED2 ();
+                    }
+                }
+            }
+
+            c--;                                // back to channel 0 for the next sample
+        }
+
+        samples += dir;
+    }
+}
+
+// Wrapper for scan_word_pass() that ensures that at least 2048 samples are processed by
+// potentially making multiple passes through the data. See description of scan_word_pass()
+// for more details.
+
+void scan_word (WavpackStream *wps, int32_t *samples, uint32_t num_samples, int dir)
+{
+    init_words (wps);       // always start from freshly initialized word-coder state
+
+    if (num_samples) {      // skip scanning (and the division below) for an empty block
+        int passes = (2048 + num_samples - 1) / num_samples;    // i.e., ceil (2048.0 / num_samples)
+
+        while (passes--)    // every pass scans the same samples in the same direction
+            scan_word_pass (wps, samples, num_samples, dir);
+    }
+}
+