diff --git a/src/Graphics/OpenGLContext/GLSL/glsl_CombinerProgramUniformFactoryAccurate.cpp b/src/Graphics/OpenGLContext/GLSL/glsl_CombinerProgramUniformFactoryAccurate.cpp
index f5bf4a49e..31600ddc0 100644
--- a/src/Graphics/OpenGLContext/GLSL/glsl_CombinerProgramUniformFactoryAccurate.cpp
+++ b/src/Graphics/OpenGLContext/GLSL/glsl_CombinerProgramUniformFactoryAccurate.cpp
@@ -111,7 +111,7 @@ class UMipmap : public UniformGroup
 		maxTile = std::min(gSP.texture.level, 1u); // Hack for HD textures
 		uMaxTile.set(maxTile, _force);
 
-		bool bNoAtlasTex = (_pTexture != nullptr && _pTexture->bHDTexture) ||
+		bool bNoAtlasTex = (_pTexture != nullptr && _pTexture->bHDTexture && _pTexture->max_level == 0) ||
 			maxTile == 0 ||
 			gDP.otherMode.textureLOD != G_TL_LOD ||
 			(gDP.otherMode.textureDetail != G_TD_DETAIL && maxTile == 1);
diff --git a/src/GraphicsDrawer.h b/src/GraphicsDrawer.h
index 5f0c307c5..cd1d24ff6 100644
--- a/src/GraphicsDrawer.h
+++ b/src/GraphicsDrawer.h
@@ -21,7 +21,7 @@ struct FrameBuffer;
 #define ELEMBUFF_SIZE 1024U
 
 constexpr f32 SCREEN_SIZE_DIM = 640.0f;
-constexpr u32 MIPMAP_TILE_WIDTH = 256u;
+constexpr u32 MIPMAP_TILE_WIDTH = 512u;
 
 enum class DrawingState
 {
diff --git a/src/Textures.cpp b/src/Textures.cpp
index 625b21e94..9b308be38 100644
--- a/src/Textures.cpp
+++ b/src/Textures.cpp
@@ -3,6 +3,7 @@
 #include <algorithm>
 #include <thread>   // std::this_thread::sleep_for
 #include <chrono>   // std::chrono::seconds
+#include "inc/lancir.h"
 #include "Platform.h"
 #include "Textures.h"
 #include "GBI.h"
@@ -1151,6 +1152,80 @@ void TextureCache::_loadBackground(CachedTexture *pTexture)
 	free(pDest);
 }
 
+void _loadHiresTextureMipMapAccurate(CachedTexture *_pTexture, GHQTexInfo const& _ghqTexInfo, u64 _ricecrc)
+{
+	u32 texWidth = _ghqTexInfo.width;
+	u32 texHeight = _ghqTexInfo.height;
+	unsigned int totalTexSize = std::max(static_cast<u32>(texWidth * texHeight + 16), MIPMAP_TILE_WIDTH)
+		* (_pTexture->max_level + 1);
+
+	std::vector<u32> m_tempTextureHolder(totalTexSize);
+	std::vector<u32> tileData(texWidth * texHeight / 4);
+	u32* pTileData = reinterpret_cast<u32*>(_ghqTexInfo.data);
+
+	u32 mipLevel = 0;
+	u32 texDataOffset = 16; // directory: number of gDP.tiles (8) * 2 words
+	avir::CLancIR imageResizer;
+
+	// Load all tiles into one 1D texture atlas.
+ u32 mipRatioS = gDP.tiles[gSP.texture.tile + 1].shifts + 5u; + if (mipRatioS >= 16u) mipRatioS -= 16u; + u32 mipRatioT = gDP.tiles[gSP.texture.tile + 1].shiftt + 5u; + if (mipRatioT >= 16) mipRatioT -= 16u; + while (true) + { + const u32 tileSizePacked = texWidth | (mipRatioT << 16) | (mipRatioS << 24); + m_tempTextureHolder[mipLevel * 2] = texDataOffset; + m_tempTextureHolder[mipLevel * 2 + 1] = tileSizePacked; + + txfilter_dmptx((u8*)pTileData, texWidth, texHeight, + texWidth, (u16)_ghqTexInfo.format, + N64FormatSize(_pTexture->format, _pTexture->size), + _ricecrc + mipLevel); + + std::copy_n(pTileData, texWidth * texHeight, &m_tempTextureHolder[texDataOffset]); + pTileData = &m_tempTextureHolder[texDataOffset]; + texDataOffset += texWidth * texHeight; + if (mipLevel == _pTexture->max_level) + break; + + ++mipLevel; + u32 mipRatioSNew = gDP.tiles[gSP.texture.tile + mipLevel + 1].shifts + 5u; + if (mipRatioSNew >= 16u) mipRatioSNew -= 16u; + u32 mipRatioTNew = gDP.tiles[gSP.texture.tile + mipLevel + 1].shiftt + 5u; + if (mipRatioTNew >= 16u) mipRatioTNew -= 16u; + u32 shifts = mipRatioSNew - mipRatioS; + u32 shiftt = mipRatioTNew - mipRatioT; + if (shifts > 0 || shiftt > 0) { + imageResizer.resizeImage((u8*)pTileData, texWidth, texHeight, (u8*)tileData.data(), texWidth >> shifts, texHeight >> shiftt, 4); + texWidth >>= shifts; + texHeight >>= shiftt; + mipRatioS = mipRatioSNew; + mipRatioT = mipRatioTNew; + pTileData = tileData.data(); + } + } + + u32 texformat = gfxContext.convertInternalTextureFormat(_ghqTexInfo.format); + Context::InitTextureParams params; + params.handle = _pTexture->name; + params.textureUnitIndex = textureIndices::Tex[1]; + params.mipMapLevel = 0; + params.mipMapLevels = 1; + params.msaaLevel = 0; + params.width = std::min(texDataOffset, MIPMAP_TILE_WIDTH); + params.height = (texDataOffset / MIPMAP_TILE_WIDTH) + ((texDataOffset % MIPMAP_TILE_WIDTH) ? 1 : 0); + params.internalFormat = InternalColorFormatParam(texformat); + params.format = ColorFormatParam(_ghqTexInfo.texture_format); + params.dataType = DatatypeParam(_ghqTexInfo.pixel_type); + params.data = m_tempTextureHolder.data(); + gfxContext.init2DTexture(params); + assert(!gfxContext.isError()); + _pTexture->mipmapAtlasWidth = params.width; + _pTexture->mipmapAtlasHeight = params.height; + _pTexture->textureBytes = texDataOffset << 2; +} + bool TextureCache::_loadHiresTexture(u32 _tile, CachedTexture *_pTexture, u64 & _ricecrc, u64 & _strongcrc) { if (config.textureFilter.txHiresEnable == 0 || !TFH.isInited()) @@ -1233,6 +1308,21 @@ bool TextureCache::_loadHiresTexture(u32 _tile, CachedTexture *_pTexture, u64 & hirestexFound = txfilter_hirestex(_pTexture->crc, _strongcrc, palette, N64FormatSize(_pTexture->format, _pTexture->size), &ghqTexInfo); } if (hirestexFound && ghqTexInfo.width != 0 && ghqTexInfo.height != 0) { + if (config.generalEmulation.enableInaccurateTextureCoordinates == 0 && + _tile > 0 && + currentCombiner()->usesLOD() && + gSP.texture.level > 1) { + _pTexture->max_level = gDP.otherMode.textureDetail == G_TD_DETAIL ? 
+				static_cast<u8>(gSP.texture.level) :
+				static_cast<u8>(gSP.texture.level - 1);
+		}
+		if (_pTexture->max_level > 0)
+		{
+			_loadHiresTextureMipMapAccurate(_pTexture, ghqTexInfo, _ricecrc);
+			_updateCachedTexture(ghqTexInfo, _pTexture, width, height);
+			return true;
+		}
+
 		ghqTexInfo.format = gfxContext.convertInternalTextureFormat(ghqTexInfo.format);
 		Context::InitTextureParams params;
 		params.handle = _pTexture->name;
@@ -1666,7 +1756,6 @@ void TextureCache::_loadAccurate(u32 _tile, CachedTexture *_pTexture)
 
 	// Load all tiles into one 1D texture atlas.
 	while (true)
 	{
-		u32 mipRatioS = gDP.tiles[gSP.texture.tile + mipLevel + 1].shifts + 5u;
 		if (mipRatioS >= 16u) mipRatioS -= 16u;
 		u32 mipRatioT = gDP.tiles[gSP.texture.tile + mipLevel + 1].shiftt + 5u;
@@ -1699,7 +1788,6 @@ void TextureCache::_loadAccurate(u32 _tile, CachedTexture *_pTexture)
 		++mipLevel;
 		const u32 tileMipLevel = gSP.texture.tile + mipLevel + 1;
 		gDPTile & mipTile = gDP.tiles[tileMipLevel];
-		gDPTile & prevMipTile = gDP.tiles[tileMipLevel - 1];
 		line = mipTile.line;
 		tmptex.tMem = mipTile.tmem;
 		tmptex.palette = mipTile.palette;
diff --git a/src/inc/lancir.h b/src/inc/lancir.h
new file mode 100644
index 000000000..12e5c8207
--- /dev/null
+++ b/src/inc/lancir.h
@@ -0,0 +1,2446 @@
+//$ nobt
+//$ nocpp
+
+/**
+ * @file lancir.h
+ *
+ * @version 3.0.11
+ *
+ * @brief The self-contained header-only "LANCIR" image resizing algorithm.
+ *
+ * This is the self-contained inclusion file for the "LANCIR" image resizer,
+ * a part of the AVIR library. Features scalar, AVX, SSE2, and NEON
+ * optimizations as well as batched resizing technique which provides a better
+ * CPU cache performance.
+ *
+ * AVIR Copyright (c) 2015-2024 Aleksey Vaneev
+ *
+ * @mainpage
+ *
+ * @section intro_sec Introduction
+ *
+ * Description is available at https://github.com/avaneev/avir
+ *
+ * @section license License
+ *
+ * LICENSE:
+ *
+ * Copyright (c) 2015-2024 Aleksey Vaneev
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef AVIR_CLANCIR_INCLUDED
+#define AVIR_CLANCIR_INCLUDED
+
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+
+#if defined( __AVX__ ) || defined( __AVX2__ )
+
+	#include <immintrin.h>
+
+	#define LANCIR_AVX
+	#define LANCIR_SSE2 // Some functions use SSE2; AVX has a higher priority.
+	#define LANCIR_ALIGN 32
+
+#elif defined( __SSE4_2__ ) || defined( __SSE4_1__ ) || \
+	defined( __SSSE3__ ) || defined( __SSE3__ ) || defined( __SSE2__ ) || \
+	defined( __x86_64__ ) || defined( __amd64 ) || defined( _M_X64 ) || \
+	defined( _M_AMD64 ) || ( defined( _M_IX86_FP ) && _M_IX86_FP == 2 )
+
+	#if defined( _MSC_VER )
+		#include <intrin.h>
+	#else // defined( _MSC_VER )
+		#include <emmintrin.h>
+	#endif // defined( _MSC_VER )
+
+	#define LANCIR_SSE2
+	#define LANCIR_ALIGN 16
+
+#elif defined( __aarch64__ ) || defined( __arm64 ) || defined( __ARM_NEON )
+
+	#include <arm_neon.h>
+
+	#define LANCIR_NEON
+	#define LANCIR_ALIGN 16
+
+#else // NEON
+
+	#define LANCIR_ALIGN 4
+
+#endif // NEON
+
+namespace avir {
+
+/**
+ * @brief LANCIR resizing parameters class.
+ *
+ * An object of this class, which can be allocated on stack, can be used to
+ * pass non-default parameters to the resizing algorithm. See the constructor
+ * for the default values.
+ */
+
+class CLancIRParams
+{
+public:
+	int SrcSSize; ///< Physical size of the source scanline, in elements (not
+		///< bytes). If this value is below 1, SrcWidth * ElCount will be
+		///< used.
+	int NewSSize; ///< Physical size of the destination scanline, in elements
+		///< (not bytes). If this value is below 1, NewWidth * ElCount will be
+		///< used.
+	double kx; ///< Resizing step - horizontal (one output pixel corresponds
+		///< to `k` input pixels). A downsizing factor if greater than 1.0;
+		///< upsizing factor if below or equal to 1.0. Multiply by -1 if you
+		///< would like to bypass `ox` and `oy` adjustment which is done by
+		///< default to produce a centered image. If this step value equals 0,
+		///< the step value will be chosen automatically.
+	double ky; ///< Resizing step - vertical. Same as `kx`.
+	double ox; ///< Start X pixel offset within the source image, can be
+		///< negative. A positive offset moves the image to the left.
+		///<
+	double oy; ///< Start Y pixel offset within the source image, can be
+		///< negative. A positive offset moves the image to the top.
+		///<
+	double la; ///< Lanczos window function's `a` parameter, greater or equal
+		///< to 2.0.
+		///<
+
+	/**
+	 * Default constructor, with optional arguments that correspond to class
+	 * variables.
+	 *
+	 * @param aSrcSSize Physical size of the source scanline.
+	 * @param aNewSSize Physical size of the destination scanline.
+	 * @param akx Resizing step - horizontal.
+	 * @param aky Resizing step - vertical.
+	 * @param aox Start X pixel offset.
+	 * @param aoy Start Y pixel offset.
+	 */
+
+	CLancIRParams( const int aSrcSSize = 0, const int aNewSSize = 0,
+		const double akx = 0.0, const double aky = 0.0,
+		const double aox = 0.0, const double aoy = 0.0 )
+		: SrcSSize( aSrcSSize )
+		, NewSSize( aNewSSize )
+		, kx( akx )
+		, ky( aky )
+		, ox( aox )
+		, oy( aoy )
+		, la( 3.0 )
+	{
+	}
+};
+
+/**
+ * @brief LANCIR image resizer class.
+ *
+ * The object of this class can be used to resize 1-4 channel images to any
+ * required size.
Resizing is performed by utilizing Lanczos filters, with + * 8-bit precision. This class offers a kind of "optimal" Lanczos resampling + * implementation. + * + * Object of this class can be allocated on stack. + * + * Note that object of this class does not free temporary buffers and + * variables after the resizeImage() function call (until object's + * destruction): these buffers are reused (or reallocated) on subsequent + * calls, thus making batch resizing of images faster. This means resizing is + * not thread-safe: a separate object should be created for each thread. + */ + +class CLancIR +{ +private: + CLancIR( const CLancIR& ) + { + // Unsupported. + } + + CLancIR& operator = ( const CLancIR& ) + { + // Unsupported. + return( *this ); + } + +public: + CLancIR() + : FltBuf0( NULL ) + , FltBuf0Len( 0 ) + , spv0( NULL ) + , spv0len( 0 ) + , spv( NULL ) + { + } + + ~CLancIR() + { + delete[] FltBuf0; + delete[] spv0; + } + + /** + * @brief Function resizes an image. + * + * Performs input-to-output type conversion, if necessary. + * + * @param[in] SrcBuf Source image buffer. + * @param SrcWidth Source image width, in pixels. + * @param SrcHeight Source image height, in pixels. + * @param[out] NewBuf Buffer to accept the resized image. Cannot be equal + * to SrcBuf. + * @param NewWidth New image width, in pixels. + * @param NewHeight New image height, in pixels. + * @param ElCount The number of elements (channels) used to store each + * source and destination pixel (1-4). + * @tparam Tin Input buffer's element type. Can be uint8_t (0-255 value + * range), uint16_t (0-65535 value range), float (0-1 value range), double + * (0-1 value range). uint32_t type is treated as uint16_t. Signed integer + * types and larger integer types are unsupported. + * @tparam Tout Output buffer's element type, treated like `Tin`. If `Tin` + * and `Tout` types do not match, an output value scaling will be applied. + * Floating-point output will not clamped/clipped/saturated, integer + * output is always rounded and clamped. + * @return The number of available output scanlines. Equals to NewHeight, + * or 0 on function parameters error. + */ + + template< typename Tin, typename Tout > + int resizeImage( const Tin* const SrcBuf, const int SrcWidth, + const int SrcHeight, Tout* const NewBuf, const int NewWidth, + const int NewHeight, const int ElCount, + const CLancIRParams* const aParams = NULL ) + { + if(( SrcWidth < 0 ) | ( SrcHeight < 0 ) | ( NewWidth <= 0 ) | + ( NewHeight <= 0 ) | ( SrcBuf == NULL ) | ( NewBuf == NULL ) | + ( (const void*) SrcBuf == (const void*) NewBuf )) + { + return( 0 ); + } + + static const CLancIRParams DefParams; + const CLancIRParams& Params = ( aParams != NULL ? + *aParams : DefParams ); + + if( Params.la < 2.0 ) + { + return( 0 ); + } + + const int OutSSize = NewWidth * ElCount; + const size_t NewScanlineSize = ( Params.NewSSize < 1 ? + OutSSize : Params.NewSSize ); + + if(( SrcWidth == 0 ) | ( SrcHeight == 0 )) + { + Tout* op = NewBuf; + int i; + + for( i = 0; i < NewHeight; i++ ) + { + memset( op, 0, OutSSize * sizeof( Tout )); + op += NewScanlineSize; + } + + return( NewHeight ); + } + + const size_t SrcScanlineSize = ( Params.SrcSSize < 1 ? + SrcWidth * ElCount : Params.SrcSSize ); + + double ox = Params.ox; + double oy = Params.oy; + double kx; + double ky; + + if( Params.kx >= 0.0 ) + { + kx = ( Params.kx == 0.0 ? 
+ (double) SrcWidth / NewWidth : Params.kx ); + + ox += ( kx - 1.0 ) * 0.5; + } + else + { + kx = -Params.kx; + } + + if( Params.ky >= 0.0 ) + { + ky = ( Params.ky == 0.0 ? + (double) SrcHeight / NewHeight : Params.ky ); + + oy += ( ky - 1.0 ) * 0.5; + } + else + { + ky = -Params.ky; + } + + if( rfv.update( Params.la, ky, ElCount )) + { + rsv.reset(); + rsh.reset(); + } + + CResizeFilters* rfh; // Pointer to resizing filters for horizontal + // resizing, may equal to `rfv` if the same stepping is in use. + + if( kx == ky ) + { + rfh = &rfv; + } + else + { + rfh = &rfh0; + + if( rfh0.update( Params.la, kx, ElCount )) + { + rsh.reset(); + } + } + + rsv.update( SrcHeight, NewHeight, oy, rfv, spv ); + rsh.update( SrcWidth, NewWidth, ox, *rfh ); + + // Calculate vertical progressive resizing's batch size. Progressive + // batching is used to try to keep addressing within the cache + // capacity. This technique definitely works well for single-threaded + // resizing on most CPUs, but may not provide an additional benefit + // for multi-threaded resizing, or in a system-wide high-load + // situations. + + const size_t FltWidthE = ( rsh.padl + SrcWidth + rsh.padr ) * ElCount; + const double CacheSize = 5500000.0; // Tuned for various CPUs. + const double OpSize = (double) SrcScanlineSize * SrcHeight * + sizeof( Tin ) + (double) FltWidthE * NewHeight * sizeof( float ); + + int BatchSize = (int) ( NewHeight * CacheSize / ( OpSize + 1.0 )); + + if( BatchSize < 8 ) + { + BatchSize = 8; + } + + if( BatchSize > NewHeight ) + { + BatchSize = NewHeight; + } + + // Allocate/resize intermediate buffers. + + const int svs = ( rsv.padl + SrcHeight + rsv.padr ) * ElCount; + float* const pspv0 = spv0; + reallocBuf( spv0, spv, spv0len, ( svs > OutSSize ? svs : OutSSize )); + reallocBuf( FltBuf0, FltBuf, FltBuf0Len, FltWidthE * BatchSize ); + + if( spv0 != pspv0 ) + { + rsv.updateSPO( rfv, spv ); + } + + // Prepare output-related constants. + + const bool IsOutFloat = ( (Tout) 0.25 != 0 ); + const int Clamp = ( sizeof( Tout ) == 1 ? 255 : 65535 ); + const float OutMul = ( IsOutFloat ? 1.0f : (float) Clamp ) / + ( (Tin) 0.25 != 0 ? 1 : ( sizeof( Tin ) == 1 ? 255 : 65535 )); + + // Perform batched resizing. + + const CResizePos* rpv = rsv.pos; + Tout* opn = NewBuf; + int bl = NewHeight; + + while( bl > 0 ) + { + const int bc = ( bl > BatchSize ? BatchSize : bl ); + + int kl = rfv.KernelLen; + const Tin* ip = SrcBuf; + float* op = FltBuf + rsh.padl * ElCount; + + const int so = (int) rpv[ 0 ].so; + float* const sp = spv + so * ElCount; + + int cc = (int) rpv[ bc - 1 ].so - so + kl; // Pixel copy count. + int rl = 0; // Leftmost pixel's replication count. + int rr = 0; // Rightmost pixel's replication count. + + const int socc = so + cc; + const int spe = rsv.padl + SrcHeight; + + // Calculate scanline copying and padding parameters, depending on + // the batch's size and its vertical offset. + + if( so < rsv.padl ) + { + if( socc <= rsv.padl ) + { + rl = cc; + cc = 0; + } + else + { + if( socc > spe ) + { + rr = socc - spe; + cc -= rr; + } + + rl = rsv.padl - so; + cc -= rl; + } + } + else + { + if( so >= spe ) + { + rr = cc; + cc = 0; + ip += SrcHeight * SrcScanlineSize; + } + else + { + if( socc > spe ) + { + rr = socc - spe; + cc -= rr; + } + + ip += ( so - rsv.padl ) * SrcScanlineSize; + } + } + + // Batched vertical resizing. 
+ + int i; + + if( ElCount == 1 ) + { + for( i = 0; i < SrcWidth; i++ ) + { + copyScanline1v( ip, SrcScanlineSize, sp, cc, rl, rr ); + resize1< false >( NULL, op, FltWidthE, rpv, kl, bc ); + ip += 1; + op += 1; + } + } + else + if( ElCount == 2 ) + { + for( i = 0; i < SrcWidth; i++ ) + { + copyScanline2v( ip, SrcScanlineSize, sp, cc, rl, rr ); + resize2< false >( NULL, op, FltWidthE, rpv, kl, bc ); + ip += 2; + op += 2; + } + } + else + if( ElCount == 3 ) + { + for( i = 0; i < SrcWidth; i++ ) + { + copyScanline3v( ip, SrcScanlineSize, sp, cc, rl, rr ); + resize3< false >( NULL, op, FltWidthE, rpv, kl, bc ); + ip += 3; + op += 3; + } + } + else // ElCount == 4 + { + for( i = 0; i < SrcWidth; i++ ) + { + copyScanline4v( ip, SrcScanlineSize, sp, cc, rl, rr ); + resize4< false >( NULL, op, FltWidthE, rpv, kl, bc ); + ip += 4; + op += 4; + } + } + + // Perform horizontal resizing batch, and produce final output. + + float* ipf = FltBuf; + kl = rfh -> KernelLen; + + if( ElCount == 1 ) + { + for( i = 0; i < bc; i++ ) + { + padScanline1h( ipf, rsh, SrcWidth ); + resize1< true >( ipf, spv, 1, rsh.pos, kl, NewWidth ); + copyToOutput( spv, opn, OutSSize, Clamp, IsOutFloat, + OutMul ); + + ipf += FltWidthE; + opn += NewScanlineSize; + } + } + else + if( ElCount == 2 ) + { + for( i = 0; i < bc; i++ ) + { + padScanline2h( ipf, rsh, SrcWidth ); + resize2< true >( ipf, spv, 2, rsh.pos, kl, NewWidth ); + copyToOutput( spv, opn, OutSSize, Clamp, IsOutFloat, + OutMul ); + + ipf += FltWidthE; + opn += NewScanlineSize; + } + } + else + if( ElCount == 3 ) + { + for( i = 0; i < bc; i++ ) + { + padScanline3h( ipf, rsh, SrcWidth ); + resize3< true >( ipf, spv, 3, rsh.pos, kl, NewWidth ); + copyToOutput( spv, opn, OutSSize, Clamp, IsOutFloat, + OutMul ); + + ipf += FltWidthE; + opn += NewScanlineSize; + } + } + else // ElCount == 4 + { + for( i = 0; i < bc; i++ ) + { + padScanline4h( ipf, rsh, SrcWidth ); + resize4< true >( ipf, spv, 4, rsh.pos, kl, NewWidth ); + copyToOutput( spv, opn, OutSSize, Clamp, IsOutFloat, + OutMul ); + + ipf += FltWidthE; + opn += NewScanlineSize; + } + } + + rpv += bc; + bl -= bc; + } + + return( NewHeight ); + } + + /** + * @brief Legacy resizing function. Not recommended for new projects. + * + * See the prior resizeImage() function and CLancIRParams class for + * details. + * + * @param[in] SrcBuf Source image buffer. + * @param SrcWidth Source image width, in pixels. + * @param SrcHeight Source image height, in pixels. + * @param SrcSSize Physical size of the source scanline, in elements (not + * bytes). + * @param[out] NewBuf Buffer to accept the resized image. Cannot be equal + * to SrcBuf. + * @param NewWidth New image width, in pixels. + * @param NewHeight New image height, in pixels. + * @param NewSSize Physical size of the destination scanline, in elements + * (not bytes). + * @param ElCount The number of elements (channels) used to store each + * source and destination pixel (1-4). + * @param kx0 Resizing step - horizontal. + * @param ky0 Resizing step - vertical. Same as `kx0`. + * @param ox Start X pixel offset within the source image. + * @param oy Start Y pixel offset within the source image. + * @tparam Tin Input buffer's element type. + * @tparam Tout Output buffer's element type. + * @return The number of available output scanlines. Equals to NewHeight, + * or 0 on function parameters error. 
+ */ + + template< typename Tin, typename Tout > + int resizeImage( const Tin* const SrcBuf, const int SrcWidth, + const int SrcHeight, const int SrcSSize, Tout* const NewBuf, + const int NewWidth, const int NewHeight, const int NewSSize, + const int ElCount, const double kx0 = 0.0, const double ky0 = 0.0, + double ox = 0.0, double oy = 0.0 ) + { + const CLancIRParams Params( SrcSSize, NewSSize, kx0, ky0, ox, oy ); + + return( resizeImage( SrcBuf, SrcWidth, SrcHeight, NewBuf, NewWidth, + NewHeight, ElCount, &Params )); + } + +protected: + float* FltBuf0; ///< Intermediate resizing buffer. + size_t FltBuf0Len; ///< Length of `FltBuf0`. + float* FltBuf; ///< Address-aligned `FltBuf0`. + float* spv0; ///< Scanline buffer for vertical resizing, also used at the + ///< output stage. + ///< + int spv0len; ///< Length of `spv0`. + float* spv; ///< Address-aligned `spv0`. + + /** + * Function reallocates a typed buffer if its current length is smaller + * than the required length. + * + * @param buf0 Reference to the pointer of the previously allocated + * buffer. + * @param buf Reference to address-aligned `buf0` pointer. + * @param len The current length of the `buf0`. + * @param newlen A new required length. + * @tparam Tb Buffer element type. + * @tparam Tl Length variable type. + */ + + template< typename Tb, typename Tl > + static void reallocBuf( Tb*& buf0, Tb*& buf, Tl& len, Tl newlen ) + { + newlen += LANCIR_ALIGN; + + if( newlen > len ) + { + if( buf0 != NULL ) + { + delete[] buf0; + buf0 = NULL; + len = 0; + } + + buf0 = new Tb[ newlen ]; + len = newlen; + buf = (Tb*) (( (uintptr_t) buf0 + LANCIR_ALIGN - 1 ) & + ~(uintptr_t) ( LANCIR_ALIGN - 1 )); + } + } + + /** + * Function reallocates a typed buffer if its current length is smaller + * than the required length. + * + * @param buf Reference to the pointer of the previously allocated buffer; + * address alignment will not be applied. + * @param len The current length of the `buf0`. + * @param newlen A new required length. + * @tparam Tb Buffer element type. + * @tparam Tl Length variable type. + */ + + template< typename Tb, typename Tl > + static void reallocBuf( Tb*& buf, Tl& len, const Tl newlen ) + { + if( newlen > len ) + { + if( buf != NULL ) + { + delete[] buf; + buf = NULL; + len = 0; + } + + buf = new Tb[ newlen ]; + len = newlen; + } + } + + class CResizeScanline; + + /** + * Class implements fractional delay filter bank calculation. + */ + + class CResizeFilters + { + friend class CResizeScanline; + + public: + int KernelLen; ///< Resampling filter kernel's length, taps. Available + ///< after the update() function call. Always an even value, + ///< should not be lesser than 4. + + CResizeFilters() + : Filters( NULL ) + , FiltersLen( 0 ) + , la( 0.0 ) + { + memset( Bufs0, 0, sizeof( Bufs0 )); + memset( Bufs0Len, 0, sizeof( Bufs0Len )); + } + + ~CResizeFilters() + { + int i; + + for( i = 0; i < BufCount; i++ ) + { + delete[] Bufs0[ i ]; + } + + delete[] Filters; + } + + /** + * Function updates the resizing filter bank. + * + * @param la0 Lanczos `a` parameter value (greater or equal to 2.0), + * can be fractional. + * @param k0 Resizing step. + * @param ElCount0 Image's element count, may be used for SIMD filter + * tap replication. + * @return `true` if an update occured and scanline resizing positions + * should be updated unconditionally. 
+ */ + + bool update( const double la0, const double k0, const int ElCount0 ) + { + if( la0 == la && k0 == k && ElCount0 == ElCount ) + { + return( false ); + } + + const double NormFreq = ( k0 <= 1.0 ? 1.0 : 1.0 / k0 ); + Freq = 3.1415926535897932 * NormFreq; + FreqA = Freq / la0; + + Len2 = la0 / NormFreq; + fl2 = (int) ceil( Len2 ); + KernelLen = fl2 + fl2; + + #if LANCIR_ALIGN > 4 + + ElRepl = ElCount0; + KernelLenA = KernelLen * ElRepl; + + const int elalign = + (int) ( LANCIR_ALIGN / sizeof( float )) - 1; + + KernelLenA = ( KernelLenA + elalign ) & ~elalign; + + #else // LANCIR_ALIGN > 4 + + ElRepl = 1; + KernelLenA = KernelLen; + + #endif // LANCIR_ALIGN > 4 + + FracCount = 1000; // Enough for Lanczos implicit 8-bit precision. + + la = 0.0; + reallocBuf( Filters, FiltersLen, FracCount + 1 ); + + memset( Filters, 0, FiltersLen * sizeof( Filters[ 0 ])); + + setBuf( 0 ); + + la = la0; + k = k0; + ElCount = ElCount0; + + return( true ); + } + + /** + * Function returns filter at the specified fractional offset. This + * function can only be called after a prior update() function call. + * + * @param x Fractional offset, [0; 1]. + */ + + const float* getFilter( const double x ) + { + const int Frac = (int) ( x * FracCount + 0.5 ); + float* flt = Filters[ Frac ]; + + if( flt != NULL ) + { + return( flt ); + } + + flt = Bufs[ CurBuf ] + CurBufFill * KernelLenA; + Filters[ Frac ] = flt; + CurBufFill++; + + if( CurBufFill == BufLen ) + { + setBuf( CurBuf + 1 ); + } + + makeFilterNorm( flt, 1.0 - (double) Frac / FracCount ); + + if( ElRepl > 1 ) + { + replicateFilter( flt, KernelLen, ElRepl ); + } + + return( flt ); + } + + protected: + double Freq; ///< Circular frequency of the filter. + double FreqA; ///< Circular frequency of the window function. + double Len2; ///< Half resampling filter's length, unrounded. + int fl2; ///< Half resampling filter's length, integer. + int FracCount; ///< The number of fractional positions for which + ///< filters can be created. + ///< + int KernelLenA; ///< SIMD-aligned and replicated filter kernel's + ///< length. + ///< + int ElRepl; ///< The number of repetitions of each filter tap. + static const int BufCount = 4; ///< The maximal number of buffers that + ///< can be in use. + ///< + static const int BufLen = 256; ///< The number of fractional filters + ///< a single buffer may contain. Both `BufLen` and `BufCount` + ///< should correspond to the `FracCount` used. + float* Bufs0[ BufCount ]; ///< Buffers that hold all filters, + ///< original. + ///< + int Bufs0Len[ BufCount ]; ///< Allocated lengthes in `Bufs0`, in + ///< `float` elements. + ///< + float* Bufs[ BufCount ]; ///< Address-aligned `Bufs0`. + int CurBuf; ///< Filter buffer currently being filled. + int CurBufFill; ///< The number of fractional positions filled in the + ///< current filter buffer. + ///< + float** Filters; ///< Fractional delay filters for all positions. + ///< A particular pointer equals NULL if a filter for such + ///< position has not been created yet. + int FiltersLen; ///< Allocated length of Filters, in elements. + double la; ///< Current `la`. + double k; ///< Current `k`. + int ElCount; ///< Current `ElCount`. + + /** + * Function changes the buffer currently being filled, check its + * size and reallocates it if necessary, then resets its fill counter. + * + * @param bi New current buffer index. 
+ */ + + void setBuf( const int bi ) + { + reallocBuf( Bufs0[ bi ], Bufs[ bi ], Bufs0Len[ bi ], + BufLen * KernelLenA ); + + CurBuf = bi; + CurBufFill = 0; + } + + /** + * @brief Sine-wave signal generator class. + * + * Class implements sine-wave signal generator without biasing, with + * constructor-based initialization only. This generator uses an + * oscillator instead of the `sin` function. + */ + + class CSineGen + { + public: + /** + * Constructor initializes `this` sine-wave signal generator. + * + * @param si Sine function increment, in radians. + * @param ph Starting phase, in radians. Add `0.5 x PI` for a + * cosine function. + */ + + CSineGen( const double si, const double ph ) + : svalue1( sin( ph )) + , svalue2( sin( ph - si )) + , sincr( 2.0 * cos( si )) + { + } + + /** + * @return The next value of the sine-wave, without biasing. + */ + + double generate() + { + const double res = svalue1; + + svalue1 = sincr * res - svalue2; + svalue2 = res; + + return( res ); + } + + private: + double svalue1; ///< Current sine value. + double svalue2; ///< Previous sine value. + double sincr; ///< Sine value increment. + }; + + /** + * Function creates a filter for the specified fractional delay. The + * update() function should be called prior to calling this function. + * The created filter is normalized (DC gain=1). + * + * @param[out] op Output filter buffer. + * @param FracDelay Fractional delay, 0 to 1, inclusive. + */ + + void makeFilterNorm( float* op, const double FracDelay ) const + { + CSineGen f( Freq, Freq * ( FracDelay - fl2 )); + CSineGen fw( FreqA, FreqA * ( FracDelay - fl2 )); + + float* op0 = op; + double s = 0.0; + double ut; + + int t = -fl2; + + if( t + FracDelay < -Len2 ) + { + f.generate(); + fw.generate(); + *op = (float) 0; + op++; + t++; + } + + int IsZeroX = ( fabs( FracDelay - 1.0 ) < 2.3e-13 ); + int mt = 0 - IsZeroX; + IsZeroX |= ( fabs( FracDelay ) < 2.3e-13 ); + + while( t < mt ) + { + ut = t + FracDelay; + *op = (float) ( f.generate() * fw.generate() / ( ut * ut )); + s += *op; + op++; + t++; + } + + if( IsZeroX ) // t+FracDelay==0 + { + *op = (float) ( Freq * FreqA ); + s += *op; + f.generate(); + fw.generate(); + } + else + { + ut = FracDelay; // t==0 + *op = (float) ( f.generate() * fw.generate() / ( ut * ut )); + s += *op; + } + + mt = fl2 - 2; + + while( t < mt ) + { + op++; + t++; + ut = t + FracDelay; + *op = (float) ( f.generate() * fw.generate() / ( ut * ut )); + s += *op; + } + + op++; + ut = t + 1 + FracDelay; + + if( ut > Len2 ) + { + *op = (float) 0; + } + else + { + *op = (float) ( f.generate() * fw.generate() / ( ut * ut )); + s += *op; + } + + s = 1.0 / s; + t = (int) ( op - op0 + 1 ); + + while( t != 0 ) + { + *op0 = (float) ( *op0 * s ); + op0++; + t--; + } + } + + /** + * Function replicates taps of the specified filter so that it can + * be used with SIMD loading instructions. This function works + * "in-place". + * + * @param[in,out] p Filter buffer pointer, should be sized to contain + * `kl * erp` elements. + * @param kl Filter kernel's length, in taps. + * @param erp The number of repetitions to apply. 
+ */ + + static void replicateFilter( float* const p, const int kl, + const int erp ) + { + const float* ip = p + kl - 1; + float* op = p + ( kl - 1 ) * erp; + int c = kl; + + if( erp == 2 ) + { + while( c != 0 ) + { + const float v = *ip; + op[ 0 ] = v; + op[ 1 ] = v; + ip--; + op -= 2; + c--; + } + } + else + if( erp == 3 ) + { + while( c != 0 ) + { + const float v = *ip; + op[ 0 ] = v; + op[ 1 ] = v; + op[ 2 ] = v; + ip--; + op -= 3; + c--; + } + } + else // erp == 4 + { + while( c != 0 ) + { + const float v = *ip; + op[ 0 ] = v; + op[ 1 ] = v; + op[ 2 ] = v; + op[ 3 ] = v; + ip--; + op -= 4; + c--; + } + } + } + }; + + /** + * Structure defines source scanline positions and filters for each + * destination pixel. + */ + + struct CResizePos + { + const float* flt; ///< Fractional delay filter. + intptr_t spo; ///< Source scanline's pixel offset, in bytes, or + ///< a direct pointer to scanline buffer. + ///< + intptr_t so; ///< Offset within the source scanline, in pixels. + }; + + /** + * Class contains resizing positions, and prepares source scanline + * positions for resize filtering. The public variables become available + * after the update() function call. + */ + + class CResizeScanline + { + public: + int padl; ///< Left-padding (in pixels) required for source scanline. + int padr; ///< Right-padding (in pixels) required for source scanline. + CResizePos* pos; ///< Source scanline positions (offsets) and filters + ///< for each destination pixel position. + ///< + + CResizeScanline() + : pos( NULL ) + , poslen( 0 ) + , SrcLen( 0 ) + { + } + + ~CResizeScanline() + { + delete[] pos; + } + + /** + * Function "resets" `this` object so that the next update() call + * fully updates the position buffer. Reset is necessary if the filter + * object was updated. + */ + + void reset() + { + SrcLen = 0; + } + + /** + * Function updates resizing positions, updates `padl`, `padr`, and + * `pos` buffer. + * + * @param SrcLen0 Source image scanline length, used to create a + * scanline buffer without length pre-calculation. + * @param DstLen0 Destination image scanline length. + * @param o0 Initial source image offset. + * @param rf Resizing filters object. + * @param sp A pointer to scanline buffer, to use for absolute + * scanline positioning, can be NULL. + */ + + void update( const int SrcLen0, const int DstLen0, const double o0, + CResizeFilters& rf, float* const sp = NULL ) + { + if( SrcLen0 == SrcLen && DstLen0 == DstLen && o0 == o ) + { + return; + } + + const int fl2m1 = rf.fl2 - 1; + padl = fl2m1 - (int) floor( o0 ); + + if( padl < 0 ) + { + padl = 0; + } + + // Make sure `padr` and `pos` are in sync: calculate ending `pos` + // offset in advance. 
+ + const double k = rf.k; + + const int DstLen_m1 = DstLen0 - 1; + const double oe = o0 + k * DstLen_m1; + const int ie = (int) floor( oe ); + + padr = ie + rf.fl2 + 1 - SrcLen0; + + if( padr < 0 ) + { + padr = 0; + } + + SrcLen = 0; + reallocBuf( pos, poslen, DstLen0 ); + + const intptr_t ElCountF = rf.ElCount * sizeof( float ); + const int so = padl - fl2m1; + CResizePos* rp = pos; + intptr_t rpso; + int i; + + for( i = 0; i < DstLen_m1; i++ ) + { + const double ox = o0 + k * i; + const int ix = (int) floor( ox ); + + rp -> flt = rf.getFilter( ox - ix ); + rpso = so + ix; + rp -> spo = (intptr_t) sp + rpso * ElCountF; + rp -> so = rpso; + rp++; + } + + rp -> flt = rf.getFilter( oe - ie ); + rpso = so + ie; + rp -> spo = (intptr_t) sp + rpso * ElCountF; + rp -> so = rpso; + + SrcLen = SrcLen0; + DstLen = DstLen0; + o = o0; + } + + /** + * Function updates `pos` buffer's `spo` values. + * + * @param rf Resizing filters object. + * @param sp A pointer to scanline buffer, to use for absolute + * scanline positioning, can be NULL. + */ + + void updateSPO( CResizeFilters& rf, float* const sp ) + { + const intptr_t ElCountF = rf.ElCount * sizeof( float ); + CResizePos* const rp = pos; + int i; + + for( i = 0; i < DstLen; i++ ) + { + rp[ i ].spo = (intptr_t) sp + rp[ i ].so * ElCountF; + } + } + + protected: + int poslen; ///< Allocated `pos` buffer's length. + int SrcLen; ///< Current `SrcLen`. + int DstLen; ///< Current `DstLen`. + double o; ///< Current `o`. + }; + + CResizeFilters rfv; ///< Resizing filters for vertical resizing. + CResizeFilters rfh0; ///< Resizing filters for horizontal resizing (may + ///< not be in use). + ///< + CResizeScanline rsv; ///< Vertical resize scanline. + CResizeScanline rsh; ///< Horizontal resize scanline. + + /** + * Function copies scanline (fully or partially) from the source buffer, + * in its native format, to the internal scanline buffer, in preparation + * for vertical resizing. Variants for 1-4-channel images. + * + * @param ip Source scanline buffer pointer. + * @param ipinc `ip` increment per pixel. + * @param op Output scanline pointer. + * @param cc Source pixel copy count. + * @param repl Leftmost pixel's replication count. + * @param repr Rightmost pixel's replication count. + * @tparam T Source buffer's element type. 
+ */ + + template< typename T > + static void copyScanline1v( const T* ip, const size_t ipinc, float* op, + int cc, int repl, int repr ) + { + float v0; + + if( repl > 0 ) + { + v0 = (float) ip[ 0 ]; + + do + { + op[ 0 ] = v0; + op += 1; + + } while( --repl != 0 ); + } + + while( cc != 0 ) + { + op[ 0 ] = (float) ip[ 0 ]; + ip += ipinc; + op += 1; + cc--; + } + + if( repr > 0 ) + { + const T* const ipe = ip - ipinc; + v0 = (float) ipe[ 0 ]; + + do + { + op[ 0 ] = v0; + op += 1; + + } while( --repr != 0 ); + } + } + + template< typename T > + static void copyScanline2v( const T* ip, const size_t ipinc, float* op, + int cc, int repl, int repr ) + { + float v0, v1; + + if( repl > 0 ) + { + v0 = (float) ip[ 0 ]; + v1 = (float) ip[ 1 ]; + + do + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op += 2; + + } while( --repl != 0 ); + } + + while( cc != 0 ) + { + op[ 0 ] = (float) ip[ 0 ]; + op[ 1 ] = (float) ip[ 1 ]; + ip += ipinc; + op += 2; + cc--; + } + + if( repr > 0 ) + { + const T* const ipe = ip - ipinc; + v0 = (float) ipe[ 0 ]; + v1 = (float) ipe[ 1 ]; + + do + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op += 2; + + } while( --repr != 0 ); + } + } + + template< typename T > + static void copyScanline3v( const T* ip, const size_t ipinc, float* op, + int cc, int repl, int repr ) + { + float v0, v1, v2; + + if( repl > 0 ) + { + v0 = (float) ip[ 0 ]; + v1 = (float) ip[ 1 ]; + v2 = (float) ip[ 2 ]; + + do + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op[ 2 ] = v2; + op += 3; + + } while( --repl != 0 ); + } + + while( cc != 0 ) + { + op[ 0 ] = (float) ip[ 0 ]; + op[ 1 ] = (float) ip[ 1 ]; + op[ 2 ] = (float) ip[ 2 ]; + ip += ipinc; + op += 3; + cc--; + } + + if( repr > 0 ) + { + const T* const ipe = ip - ipinc; + v0 = (float) ipe[ 0 ]; + v1 = (float) ipe[ 1 ]; + v2 = (float) ipe[ 2 ]; + + do + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op[ 2 ] = v2; + op += 3; + + } while( --repr != 0 ); + } + } + + template< typename T > + static void copyScanline4v( const T* ip, const size_t ipinc, float* op, + int cc, int repl, int repr ) + { + float v0, v1, v2, v3; + + if( repl > 0 ) + { + v0 = (float) ip[ 0 ]; + v1 = (float) ip[ 1 ]; + v2 = (float) ip[ 2 ]; + v3 = (float) ip[ 3 ]; + + do + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op[ 2 ] = v2; + op[ 3 ] = v3; + op += 4; + + } while( --repl != 0 ); + } + + while( cc != 0 ) + { + op[ 0 ] = (float) ip[ 0 ]; + op[ 1 ] = (float) ip[ 1 ]; + op[ 2 ] = (float) ip[ 2 ]; + op[ 3 ] = (float) ip[ 3 ]; + ip += ipinc; + op += 4; + cc--; + } + + if( repr > 0 ) + { + const T* const ipe = ip - ipinc; + v0 = (float) ipe[ 0 ]; + v1 = (float) ipe[ 1 ]; + v2 = (float) ipe[ 2 ]; + v3 = (float) ipe[ 3 ]; + + do + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op[ 2 ] = v2; + op[ 3 ] = v3; + op += 4; + + } while( --repr != 0 ); + } + } + + /** + * Function pads the specified scanline buffer to the left and right by + * replicating its first and last available pixels, in preparation for + * horizontal resizing. Variants for 1-4-channel images. + * + * @param[in,out] op Scanline buffer to pad. + * @param rs Scanline resizing positions object. + * @param l Source scanline's length, in pixels. 
+ */ + + static void padScanline1h( float* op, CResizeScanline& rs, const int l ) + { + const float* ip = op + rs.padl; + + float v0 = ip[ 0 ]; + int i; + + for( i = 0; i < rs.padl; i++ ) + { + op[ i ] = v0; + } + + ip += l; + op += rs.padl + l; + + v0 = ip[ -1 ]; + + for( i = 0; i < rs.padr; i++ ) + { + op[ i ] = v0; + } + } + + static void padScanline2h( float* op, CResizeScanline& rs, const int l ) + { + const float* ip = op + rs.padl * 2; + + float v0 = ip[ 0 ]; + float v1 = ip[ 1 ]; + int i; + + for( i = 0; i < rs.padl; i++ ) + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op += 2; + } + + const int lc = l * 2; + ip += lc; + op += lc; + + v0 = ip[ -2 ]; + v1 = ip[ -1 ]; + + for( i = 0; i < rs.padr; i++ ) + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op += 2; + } + } + + static void padScanline3h( float* op, CResizeScanline& rs, const int l ) + { + const float* ip = op + rs.padl * 3; + + float v0 = ip[ 0 ]; + float v1 = ip[ 1 ]; + float v2 = ip[ 2 ]; + int i; + + for( i = 0; i < rs.padl; i++ ) + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op[ 2 ] = v2; + op += 3; + } + + const int lc = l * 3; + ip += lc; + op += lc; + + v0 = ip[ -3 ]; + v1 = ip[ -2 ]; + v2 = ip[ -1 ]; + + for( i = 0; i < rs.padr; i++ ) + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op[ 2 ] = v2; + op += 3; + } + } + + static void padScanline4h( float* op, CResizeScanline& rs, const int l ) + { + const float* ip = op + rs.padl * 4; + + float v0 = ip[ 0 ]; + float v1 = ip[ 1 ]; + float v2 = ip[ 2 ]; + float v3 = ip[ 3 ]; + int i; + + for( i = 0; i < rs.padl; i++ ) + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op[ 2 ] = v2; + op[ 3 ] = v3; + op += 4; + } + + const int lc = l * 4; + ip += lc; + op += lc; + + v0 = ip[ -4 ]; + v1 = ip[ -3 ]; + v2 = ip[ -2 ]; + v3 = ip[ -1 ]; + + for( i = 0; i < rs.padr; i++ ) + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op[ 2 ] = v2; + op[ 3 ] = v3; + op += 4; + } + } + + /** + * Function rounds a value and applies clamping. + * + * @param v Value to round and clamp. + * @param Clamp High clamp level; low level is 0. + */ + + static inline int roundclamp( const float v, const int Clamp ) + { + if( v < 0.5f ) + { + return( 0 ); + } + + const int vr = (int) ( v + 0.5f ); + + return( vr > Clamp ? Clamp : vr ); + } + + /** + * Function performs final output of the resized scanline pixels to the + * destination image buffer. + * + * @param[in] ip Input resized scanline. + * @param[out] op Output image buffer. + * @param l Output scanline's size (not pixel count). + * @param Clamp Clamp high level, used if `IsOutFloat` is `false`. + * @param IsOutFloat `true` if floating-point output, and no clamping is + * necessary. + * @param OutMul Output multiplier, for value range conversion. + * @tparam T Output buffer's element type. 
+ */ + + template< typename T > + static void copyToOutput( const float* ip, T* op, int l, const int Clamp, + const bool IsOutFloat, const float OutMul ) + { + const bool IsUnityMul = ( OutMul == 1.0f ); + + if( IsOutFloat ) + { + if( IsUnityMul ) + { + if( sizeof( op[ 0 ]) == sizeof( ip[ 0 ])) + { + memcpy( op, ip, l * sizeof( op[ 0 ])); + return; + } + else + { + int l4 = l >> 2; + l &= 3; + + while( l4 != 0 ) + { + op[ 0 ] = (T) ip[ 0 ]; + op[ 1 ] = (T) ip[ 1 ]; + op[ 2 ] = (T) ip[ 2 ]; + op[ 3 ] = (T) ip[ 3 ]; + ip += 4; + op += 4; + l4--; + } + + while( l != 0 ) + { + *op = (T) *ip; + ip++; + op++; + l--; + } + + return; + } + } + + int l4 = l >> 2; + l &= 3; + bool DoScalar = true; + + if( sizeof( op[ 0 ]) == sizeof( ip[ 0 ])) + { + #if defined( LANCIR_SSE2 ) + + DoScalar = false; + const __m128 om = _mm_set1_ps( OutMul ); + + while( l4 != 0 ) + { + _mm_storeu_ps( (float*) op, + _mm_mul_ps( _mm_load_ps( ip ), om )); + + ip += 4; + op += 4; + l4--; + } + + #elif defined( LANCIR_NEON ) + + DoScalar = false; + const float32x4_t om = vdupq_n_f32( OutMul ); + + while( l4 != 0 ) + { + vst1q_f32( (float*) op, + vmulq_f32( vld1q_f32( ip ), om )); + + ip += 4; + op += 4; + l4--; + } + + #endif // defined( LANCIR_NEON ) + } + + if( DoScalar ) + { + while( l4 != 0 ) + { + op[ 0 ] = (T) ( ip[ 0 ] * OutMul ); + op[ 1 ] = (T) ( ip[ 1 ] * OutMul ); + op[ 2 ] = (T) ( ip[ 2 ] * OutMul ); + op[ 3 ] = (T) ( ip[ 3 ] * OutMul ); + ip += 4; + op += 4; + l4--; + } + } + + while( l != 0 ) + { + *op = (T) ( *ip * OutMul ); + ip++; + op++; + l--; + } + + return; + } + + int l4 = l >> 2; + l &= 3; + + #if defined( LANCIR_SSE2 ) + + const __m128 minv = _mm_setzero_ps(); + const __m128 maxv = _mm_set1_ps( (float) Clamp ); + const __m128 om = _mm_set1_ps( OutMul ); + + unsigned int prevrm = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE( _MM_ROUND_NEAREST ); + + if( sizeof( op[ 0 ]) == 4 ) + { + while( l4 != 0 ) + { + const __m128 cv = _mm_max_ps( _mm_min_ps( + _mm_mul_ps( _mm_load_ps( ip ), om ), maxv ), minv ); + + _mm_storeu_si128( (__m128i*) op, _mm_cvtps_epi32( cv )); + + ip += 4; + op += 4; + l4--; + } + } + else + if( sizeof( op[ 0 ]) == 2 ) + { + while( l4 != 0 ) + { + const __m128 cv = _mm_max_ps( _mm_min_ps( + _mm_mul_ps( _mm_load_ps( ip ), om ), maxv ), minv ); + + const __m128i v32 = _mm_cvtps_epi32( cv ); + const __m128i v16s = _mm_shufflehi_epi16( + _mm_shufflelo_epi16( v32, 0 | 2 << 2 ), 0 | 2 << 2 ); + + const __m128i v16 = _mm_shuffle_epi32( v16s, 0 | 2 << 2 ); + + uint64_t tmp[ 2 ]; + _mm_storeu_si128( (__m128i*) tmp, v16 ); + *(uint64_t*) op = tmp[ 0 ]; + + ip += 4; + op += 4; + l4--; + } + } + else + { + while( l4 != 0 ) + { + const __m128 cv = _mm_max_ps( _mm_min_ps( + _mm_mul_ps( _mm_load_ps( ip ), om ), maxv ), minv ); + + const __m128i v32 = _mm_cvtps_epi32( cv ); + const __m128i v16s = _mm_shufflehi_epi16( + _mm_shufflelo_epi16( v32, 0 | 2 << 2 ), 0 | 2 << 2 ); + + const __m128i v16 = _mm_shuffle_epi32( v16s, 0 | 2 << 2 ); + const __m128i v8 = _mm_packus_epi16( v16, v16 ); + + *(uint32_t*) op = (uint32_t) _mm_cvtsi128_si32( v8 ); + + ip += 4; + op += 4; + l4--; + } + } + + _MM_SET_ROUNDING_MODE( prevrm ); + + #elif defined( LANCIR_NEON ) + + const float32x4_t minv = vdupq_n_f32( 0.0f ); + const float32x4_t maxv = vdupq_n_f32( (float) Clamp ); + const float32x4_t om = vdupq_n_f32( OutMul ); + const float32x4_t v05 = vdupq_n_f32( 0.5f ); + + if( sizeof( op[ 0 ]) == 4 ) + { + while( l4 != 0 ) + { + const float32x4_t cv = vmaxq_f32( vminq_f32( + vmulq_f32( vld1q_f32( ip ), om ), maxv ), 
minv ); + + vst1q_u32( (uint32_t*) op, vcvtq_u32_f32( vaddq_f32( + cv, v05 ))); + + ip += 4; + op += 4; + l4--; + } + } + else + if( sizeof( op[ 0 ]) == 2 ) + { + while( l4 != 0 ) + { + const float32x4_t cv = vmaxq_f32( vminq_f32( + vmulq_f32( vld1q_f32( ip ), om ), maxv ), minv ); + + const uint32x4_t v32 = vcvtq_u32_f32( vaddq_f32( cv, v05 )); + const uint16x4_t v16 = vmovn_u32( v32 ); + + vst1_u16( (uint16_t*) op, v16 ); + + ip += 4; + op += 4; + l4--; + } + } + else + { + while( l4 != 0 ) + { + const float32x4_t cv = vmaxq_f32( vminq_f32( + vmulq_f32( vld1q_f32( ip ), om ), maxv ), minv ); + + const uint32x4_t v32 = vcvtq_u32_f32( vaddq_f32( cv, v05 )); + const uint16x4_t v16 = vmovn_u32( v32 ); + const uint8x8_t v8 = vmovn_u16( vcombine_u16( v16, v16 )); + + *(uint32_t*) op = vget_lane_u32( (uint32x2_t) v8, 0 ); + + ip += 4; + op += 4; + l4--; + } + } + + #else // defined( LANCIR_NEON ) + + if( IsUnityMul ) + { + while( l4 != 0 ) + { + op[ 0 ] = (T) roundclamp( ip[ 0 ], Clamp ); + op[ 1 ] = (T) roundclamp( ip[ 1 ], Clamp ); + op[ 2 ] = (T) roundclamp( ip[ 2 ], Clamp ); + op[ 3 ] = (T) roundclamp( ip[ 3 ], Clamp ); + ip += 4; + op += 4; + l4--; + } + } + else + { + while( l4 != 0 ) + { + op[ 0 ] = (T) roundclamp( ip[ 0 ] * OutMul, Clamp ); + op[ 1 ] = (T) roundclamp( ip[ 1 ] * OutMul, Clamp ); + op[ 2 ] = (T) roundclamp( ip[ 2 ] * OutMul, Clamp ); + op[ 3 ] = (T) roundclamp( ip[ 3 ] * OutMul, Clamp ); + ip += 4; + op += 4; + l4--; + } + } + + #endif // defined( LANCIR_NEON ) + + if( IsUnityMul ) + { + while( l != 0 ) + { + *op = (T) roundclamp( *ip, Clamp ); + ip++; + op++; + l--; + } + } + else + { + while( l != 0 ) + { + *op = (T) roundclamp( *ip * OutMul, Clamp ); + ip++; + op++; + l--; + } + } + } + + #define LANCIR_LF_PRE \ + const CResizePos* const rpe = rp + DstLen; \ + while( rp != rpe ) \ + { \ + const float* flt = rp -> flt; \ + const float* ip; \ + if( UseSP ) \ + { \ + ip = (float*) ( (intptr_t) sp + rp -> spo ); \ + } \ + else \ + { \ + ip = (float*) rp -> spo; \ + } + + #define LANCIR_LF_POST \ + op += opinc; \ + rp++; \ + } + + /** + * Function performs scanline resizing. Variants for 1-4-channel images. + * + * @param[in] sp Source scanline buffer. + * @param[out] op Destination buffer. + * @param opinc `op` increment. + * @param rp Source scanline offsets and resizing filters. + * @param kl Filter kernel's length, in taps (always an even value). + * @param DstLen Destination length, in pixels. + * @tparam UseSP `true` if `sp` pointer should be added to `spo`. 
+ */ + + template< bool UseSP > + static void resize1( const float* const sp, float* op, const size_t opinc, + const CResizePos* rp, const int kl, const int DstLen ) + { + const int ci = kl >> 2; + + if(( kl & 3 ) == 0 ) + { + LANCIR_LF_PRE + + int c = ci; + + #if defined( LANCIR_SSE2 ) + + __m128 sum = _mm_mul_ps( _mm_load_ps( flt ), _mm_loadu_ps( ip )); + + while( --c != 0 ) + { + flt += 4; + ip += 4; + sum = _mm_add_ps( sum, _mm_mul_ps( _mm_load_ps( flt ), + _mm_loadu_ps( ip ))); + } + + sum = _mm_add_ps( sum, _mm_movehl_ps( sum, sum )); + + _mm_store_ss( op, _mm_add_ss( sum, + _mm_shuffle_ps( sum, sum, 1 ))); + + #elif defined( LANCIR_NEON ) + + float32x4_t sum = vmulq_f32( vld1q_f32( flt ), vld1q_f32( ip )); + + while( --c != 0 ) + { + flt += 4; + ip += 4; + sum = vmlaq_f32( sum, vld1q_f32( flt ), vld1q_f32( ip )); + } + + op[ 0 ] = vaddvq_f32( sum ); + + #else // defined( LANCIR_NEON ) + + float sum0 = flt[ 0 ] * ip[ 0 ]; + float sum1 = flt[ 1 ] * ip[ 1 ]; + float sum2 = flt[ 2 ] * ip[ 2 ]; + float sum3 = flt[ 3 ] * ip[ 3 ]; + + while( --c != 0 ) + { + flt += 4; + ip += 4; + sum0 += flt[ 0 ] * ip[ 0 ]; + sum1 += flt[ 1 ] * ip[ 1 ]; + sum2 += flt[ 2 ] * ip[ 2 ]; + sum3 += flt[ 3 ] * ip[ 3 ]; + } + + op[ 0 ] = ( sum0 + sum1 ) + ( sum2 + sum3 ); + + #endif // defined( LANCIR_NEON ) + + LANCIR_LF_POST + } + else + { + LANCIR_LF_PRE + + int c = ci; + + #if defined( LANCIR_SSE2 ) + + __m128 sum = _mm_mul_ps( _mm_load_ps( flt ), _mm_loadu_ps( ip )); + + while( --c != 0 ) + { + flt += 4; + ip += 4; + sum = _mm_add_ps( sum, _mm_mul_ps( _mm_load_ps( flt ), + _mm_loadu_ps( ip ))); + } + + sum = _mm_add_ps( sum, _mm_movehl_ps( sum, sum )); + + const __m128 sum2 = _mm_mul_ps( _mm_loadu_ps( flt + 2 ), + _mm_loadu_ps( ip + 2 )); + + sum = _mm_add_ps( sum, _mm_movehl_ps( sum2, sum2 )); + + _mm_store_ss( op, _mm_add_ss( sum, + _mm_shuffle_ps( sum, sum, 1 ))); + + #elif defined( LANCIR_NEON ) + + float32x4_t sum = vmulq_f32( vld1q_f32( flt ), vld1q_f32( ip )); + + while( --c != 0 ) + { + flt += 4; + ip += 4; + sum = vmlaq_f32( sum, vld1q_f32( flt ), vld1q_f32( ip )); + } + + const float32x2_t sum2 = vadd_f32( vget_high_f32( sum ), + vget_low_f32( sum )); + + op[ 0 ] = vaddv_f32( vmla_f32( sum2, vld1_f32( flt + 4 ), + vld1_f32( ip + 4 ))); + + #else // defined( LANCIR_NEON ) + + float sum0 = flt[ 0 ] * ip[ 0 ]; + float sum1 = flt[ 1 ] * ip[ 1 ]; + float sum2 = flt[ 2 ] * ip[ 2 ]; + float sum3 = flt[ 3 ] * ip[ 3 ]; + + while( --c != 0 ) + { + flt += 4; + ip += 4; + sum0 += flt[ 0 ] * ip[ 0 ]; + sum1 += flt[ 1 ] * ip[ 1 ]; + sum2 += flt[ 2 ] * ip[ 2 ]; + sum3 += flt[ 3 ] * ip[ 3 ]; + } + + op[ 0 ] = ( sum0 + sum1 ) + ( sum2 + sum3 ) + + flt[ 4 ] * ip[ 4 ] + flt[ 5 ] * ip[ 5 ]; + + #endif // defined( LANCIR_NEON ) + + LANCIR_LF_POST + } + } + + template< bool UseSP > + static void resize2( const float* const sp, float* op, const size_t opinc, + const CResizePos* rp, const int kl, const int DstLen ) + { + #if LANCIR_ALIGN > 4 + const int ci = kl >> 2; + const int cir = kl & 3; + #else // LANCIR_ALIGN > 4 + const int ci = kl >> 1; + #endif // LANCIR_ALIGN > 4 + + LANCIR_LF_PRE + + int c = ci; + + #if defined( LANCIR_AVX ) + + __m256 sum = _mm256_mul_ps( _mm256_load_ps( flt ), + _mm256_loadu_ps( ip )); + + while( --c != 0 ) + { + flt += 8; + ip += 8; + sum = _mm256_add_ps( sum, _mm256_mul_ps( _mm256_load_ps( flt ), + _mm256_loadu_ps( ip ))); + } + + __m128 res = _mm_add_ps( _mm256_extractf128_ps( sum, 0 ), + _mm256_extractf128_ps( sum, 1 )); + + if( cir == 2 ) + { + res = _mm_add_ps( res, _mm_mul_ps( 
_mm_load_ps( flt + 8 ), + _mm_loadu_ps( ip + 8 ))); + } + + res = _mm_add_ps( res, _mm_movehl_ps( res, res )); + + _mm_store_ss( op, res ); + _mm_store_ss( op + 1, _mm_shuffle_ps( res, res, 1 )); + + #elif defined( LANCIR_SSE2 ) + + __m128 sumA = _mm_mul_ps( _mm_load_ps( flt ), _mm_loadu_ps( ip )); + __m128 sumB = _mm_mul_ps( _mm_load_ps( flt + 4 ), + _mm_loadu_ps( ip + 4 )); + + while( --c != 0 ) + { + flt += 8; + ip += 8; + sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt ), + _mm_loadu_ps( ip ))); + + sumB = _mm_add_ps( sumB, _mm_mul_ps( _mm_load_ps( flt + 4 ), + _mm_loadu_ps( ip + 4 ))); + } + + sumA = _mm_add_ps( sumA, sumB ); + + if( cir == 2 ) + { + sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt + 8 ), + _mm_loadu_ps( ip + 8 ))); + } + + sumA = _mm_add_ps( sumA, _mm_movehl_ps( sumA, sumA )); + + _mm_store_ss( op, sumA ); + _mm_store_ss( op + 1, _mm_shuffle_ps( sumA, sumA, 1 )); + + #elif defined( LANCIR_NEON ) + + float32x4_t sumA = vmulq_f32( vld1q_f32( flt ), vld1q_f32( ip )); + float32x4_t sumB = vmulq_f32( vld1q_f32( flt + 4 ), + vld1q_f32( ip + 4 )); + + while( --c != 0 ) + { + flt += 8; + ip += 8; + sumA = vmlaq_f32( sumA, vld1q_f32( flt ), vld1q_f32( ip )); + sumB = vmlaq_f32( sumB, vld1q_f32( flt + 4 ), + vld1q_f32( ip + 4 )); + } + + sumA = vaddq_f32( sumA, sumB ); + + if( cir == 2 ) + { + sumA = vmlaq_f32( sumA, vld1q_f32( flt + 8 ), + vld1q_f32( ip + 8 )); + } + + vst1_f32( op, vadd_f32( vget_high_f32( sumA ), vget_low_f32( sumA ))); + + #else // defined( LANCIR_NEON ) + + const float xx = flt[ 0 ]; + const float xx2 = flt[ 1 ]; + float sum0 = xx * ip[ 0 ]; + float sum1 = xx * ip[ 1 ]; + float sum2 = xx2 * ip[ 2 ]; + float sum3 = xx2 * ip[ 3 ]; + + while( --c != 0 ) + { + flt += 2; + ip += 4; + const float xx = flt[ 0 ]; + const float xx2 = flt[ 1 ]; + sum0 += xx * ip[ 0 ]; + sum1 += xx * ip[ 1 ]; + sum2 += xx2 * ip[ 2 ]; + sum3 += xx2 * ip[ 3 ]; + } + + op[ 0 ] = sum0 + sum2; + op[ 1 ] = sum1 + sum3; + + #endif // defined( LANCIR_NEON ) + + LANCIR_LF_POST + } + + template< bool UseSP > + static void resize3( const float* const sp, float* op, const size_t opinc, + const CResizePos* rp, const int kl, const int DstLen ) + { + #if LANCIR_ALIGN > 4 + + const int ci = kl >> 2; + const int cir = kl & 3; + + LANCIR_LF_PRE + + float res[ 12 ]; + int c = ci; + + #if defined( LANCIR_AVX ) + + __m128 sumA = _mm_mul_ps( _mm_load_ps( flt ), _mm_loadu_ps( ip )); + __m256 sumB = _mm256_mul_ps( _mm256_loadu_ps( flt + 4 ), + _mm256_loadu_ps( ip + 4 )); + + while( --c != 0 ) + { + flt += 12; + ip += 12; + sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt ), + _mm_loadu_ps( ip ))); + + sumB = _mm256_add_ps( sumB, _mm256_mul_ps( + _mm256_loadu_ps( flt + 4 ), _mm256_loadu_ps( ip + 4 ))); + } + + if( cir == 2 ) + { + sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt + 12 ), + _mm_loadu_ps( ip + 12 ))); + } + + _mm_storeu_ps( res, sumA ); + + float o0 = res[ 0 ] + res[ 3 ]; + float o1 = res[ 1 ]; + float o2 = res[ 2 ]; + + _mm256_storeu_ps( res + 4, sumB ); + + o1 += res[ 4 ]; + o2 += res[ 5 ]; + + #elif defined( LANCIR_SSE2 ) + + __m128 sumA = _mm_mul_ps( _mm_load_ps( flt ), _mm_loadu_ps( ip )); + __m128 sumB = _mm_mul_ps( _mm_load_ps( flt + 4 ), + _mm_loadu_ps( ip + 4 )); + + __m128 sumC = _mm_mul_ps( _mm_load_ps( flt + 8 ), + _mm_loadu_ps( ip + 8 )); + + while( --c != 0 ) + { + flt += 12; + ip += 12; + sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt ), + _mm_loadu_ps( ip ))); + + sumB = _mm_add_ps( sumB, _mm_mul_ps( _mm_load_ps( flt + 4 ), + _mm_loadu_ps( ip + 4 
))); + + sumC = _mm_add_ps( sumC, _mm_mul_ps( _mm_load_ps( flt + 8 ), + _mm_loadu_ps( ip + 8 ))); + } + + if( cir == 2 ) + { + sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt + 12 ), + _mm_loadu_ps( ip + 12 ))); + } + + _mm_storeu_ps( res, sumA ); + _mm_storeu_ps( res + 4, sumB ); + + float o0 = res[ 0 ] + res[ 3 ]; + float o1 = res[ 1 ] + res[ 4 ]; + float o2 = res[ 2 ] + res[ 5 ]; + + _mm_storeu_ps( res + 8, sumC ); + + #elif defined( LANCIR_NEON ) + + float32x4_t sumA = vmulq_f32( vld1q_f32( flt ), vld1q_f32( ip )); + float32x4_t sumB = vmulq_f32( vld1q_f32( flt + 4 ), + vld1q_f32( ip + 4 )); + + float32x4_t sumC = vmulq_f32( vld1q_f32( flt + 8 ), + vld1q_f32( ip + 8 )); + + while( --c != 0 ) + { + flt += 12; + ip += 12; + sumA = vmlaq_f32( sumA, vld1q_f32( flt ), vld1q_f32( ip )); + sumB = vmlaq_f32( sumB, vld1q_f32( flt + 4 ), + vld1q_f32( ip + 4 )); + + sumC = vmlaq_f32( sumC, vld1q_f32( flt + 8 ), + vld1q_f32( ip + 8 )); + } + + if( cir == 2 ) + { + sumA = vmlaq_f32( sumA, vld1q_f32( flt + 12 ), + vld1q_f32( ip + 12 )); + } + + vst1q_f32( res, sumA ); + vst1q_f32( res + 4, sumB ); + + float o0 = res[ 0 ] + res[ 3 ]; + float o1 = res[ 1 ] + res[ 4 ]; + float o2 = res[ 2 ] + res[ 5 ]; + + vst1q_f32( res + 8, sumC ); + + #endif // defined( LANCIR_NEON ) + + o0 += res[ 6 ] + res[ 9 ]; + o1 += res[ 7 ] + res[ 10 ]; + o2 += res[ 8 ] + res[ 11 ]; + + if( cir == 2 ) + { + o1 += flt[ 16 ] * ip[ 16 ]; + o2 += flt[ 17 ] * ip[ 17 ]; + } + + op[ 0 ] = o0; + op[ 1 ] = o1; + op[ 2 ] = o2; + + #else // LANCIR_ALIGN > 4 + + const int ci = kl >> 1; + + LANCIR_LF_PRE + + int c = ci; + + const float xx = flt[ 0 ]; + float sum0 = xx * ip[ 0 ]; + float sum1 = xx * ip[ 1 ]; + float sum2 = xx * ip[ 2 ]; + const float xx2 = flt[ 1 ]; + float sum3 = xx2 * ip[ 3 ]; + float sum4 = xx2 * ip[ 4 ]; + float sum5 = xx2 * ip[ 5 ]; + + while( --c != 0 ) + { + flt += 2; + ip += 6; + const float xx = flt[ 0 ]; + sum0 += xx * ip[ 0 ]; + sum1 += xx * ip[ 1 ]; + sum2 += xx * ip[ 2 ]; + const float xx2 = flt[ 1 ]; + sum3 += xx2 * ip[ 3 ]; + sum4 += xx2 * ip[ 4 ]; + sum5 += xx2 * ip[ 5 ]; + } + + op[ 0 ] = sum0 + sum3; + op[ 1 ] = sum1 + sum4; + op[ 2 ] = sum2 + sum5; + + #endif // LANCIR_ALIGN > 4 + + LANCIR_LF_POST + } + + template< bool UseSP > + static void resize4( const float* const sp, float* op, const size_t opinc, + const CResizePos* rp, const int kl, const int DstLen ) + { + #if LANCIR_ALIGN > 4 + const int ci = kl >> 1; + #else // LANCIR_ALIGN > 4 + const int ci = kl; + #endif // LANCIR_ALIGN > 4 + + LANCIR_LF_PRE + + int c = ci; + + #if defined( LANCIR_AVX ) + + __m256 sum = _mm256_mul_ps( _mm256_load_ps( flt ), + _mm256_loadu_ps( ip )); + + while( --c != 0 ) + { + flt += 8; + ip += 8; + sum = _mm256_add_ps( sum, _mm256_mul_ps( _mm256_load_ps( flt ), + _mm256_loadu_ps( ip ))); + } + + _mm_store_ps( op, _mm_add_ps( _mm256_extractf128_ps( sum, 0 ), + _mm256_extractf128_ps( sum, 1 ))); + + #elif defined( LANCIR_SSE2 ) + + __m128 sumA = _mm_mul_ps( _mm_load_ps( flt ), _mm_load_ps( ip )); + __m128 sumB = _mm_mul_ps( _mm_load_ps( flt + 4 ), + _mm_load_ps( ip + 4 )); + + while( --c != 0 ) + { + flt += 8; + ip += 8; + sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt ), + _mm_load_ps( ip ))); + + sumB = _mm_add_ps( sumB, _mm_mul_ps( _mm_load_ps( flt + 4 ), + _mm_load_ps( ip + 4 ))); + } + + _mm_store_ps( op, _mm_add_ps( sumA, sumB )); + + #elif defined( LANCIR_NEON ) + + float32x4_t sumA = vmulq_f32( vld1q_f32( flt ), vld1q_f32( ip )); + float32x4_t sumB = vmulq_f32( vld1q_f32( flt + 4 ), + vld1q_f32( ip + 4 
)); + + while( --c != 0 ) + { + flt += 8; + ip += 8; + sumA = vmlaq_f32( sumA, vld1q_f32( flt ), vld1q_f32( ip )); + sumB = vmlaq_f32( sumB, vld1q_f32( flt + 4 ), + vld1q_f32( ip + 4 )); + } + + vst1q_f32( op, vaddq_f32( sumA, sumB )); + + #else // defined( LANCIR_NEON ) + + const float xx = flt[ 0 ]; + float sum0 = xx * ip[ 0 ]; + float sum1 = xx * ip[ 1 ]; + float sum2 = xx * ip[ 2 ]; + float sum3 = xx * ip[ 3 ]; + + while( --c != 0 ) + { + flt++; + ip += 4; + const float xx = flt[ 0 ]; + sum0 += xx * ip[ 0 ]; + sum1 += xx * ip[ 1 ]; + sum2 += xx * ip[ 2 ]; + sum3 += xx * ip[ 3 ]; + } + + op[ 0 ] = sum0; + op[ 1 ] = sum1; + op[ 2 ] = sum2; + op[ 3 ] = sum3; + + #endif // defined( LANCIR_NEON ) + + LANCIR_LF_POST + } + + #undef LANCIR_LF_PRE + #undef LANCIR_LF_POST +}; + +#undef LANCIR_ALIGN + +} // namespace avir + +#endif // AVIR_CLANCIR_INCLUDED
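
The new `_loadHiresTextureMipMapAccurate` path packs all mip tiles of a hires texture into a single atlas texture. The first 16 words of the atlas (`texDataOffset = 16`: 8 `gDP.tiles` entries times 2 words) form a per-level directory: word `2 * level` holds the texel offset of that level's tile, and word `2 * level + 1` packs the tile width (bits 0-15) together with the T and S mip ratios (bits 16-23 and 24-31). A minimal sketch of that encoding, with hypothetical names not taken from the patch:

```cpp
#include <cassert>
#include <cstdint>

// Illustrative encode/decode of one directory entry written by
// _loadHiresTextureMipMapAccurate:
//   width | (mipRatioT << 16) | (mipRatioS << 24)
struct MipTileInfo
{
	uint32_t width;     // tile width, in texels (bits 0-15)
	uint32_t mipRatioT; // log2 size ratio against level 0, T axis (bits 16-23)
	uint32_t mipRatioS; // log2 size ratio against level 0, S axis (bits 24-31)
};

static uint32_t packTileInfo(const MipTileInfo& t)
{
	return t.width | (t.mipRatioT << 16) | (t.mipRatioS << 24);
}

static MipTileInfo unpackTileInfo(uint32_t packed)
{
	return { packed & 0xFFFFu, (packed >> 16) & 0xFFu, packed >> 24 };
}

int main()
{
	const MipTileInfo t{ 256u, 1u, 1u };
	const MipTileInfo u = unpackTileInfo(packTileInfo(t));
	assert(u.width == t.width && u.mipRatioT == t.mipRatioT &&
		u.mipRatioS == t.mipRatioS);
	return 0;
}
```

The bump of `MIPMAP_TILE_WIDTH` from 256 to 512 fits this scheme: the atlas is uploaded as a 2D texture of width `min(texDataOffset, MIPMAP_TILE_WIDTH)` and height `ceil(texDataOffset / MIPMAP_TILE_WIDTH)`, so wider rows keep the row count manageable for the larger HD tiles.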
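`lancir.h` is consumed through its primary `resizeImage()` overload, exactly as the per-level downscale does with `imageResizer.resizeImage((u8*)pTileData, texWidth, texHeight, (u8*)tileData.data(), texWidth >> shifts, texHeight >> shiftt, 4)`. A self-contained usage sketch for 4-channel 8-bit data; the buffer contents and sizes are illustrative:

```cpp
#include <cstdint>
#include <vector>

#include "inc/lancir.h"

int main()
{
	const int srcW = 64, srcH = 64;
	const int dstW = srcW >> 1, dstH = srcH >> 1; // one mip step down

	std::vector<uint8_t> src(srcW * srcH * 4, 128); // flat grey RGBA
	std::vector<uint8_t> dst(dstW * dstH * 4);

	// CLancIR keeps its temporary buffers between calls, which makes batch
	// resizing faster but also makes the object non-thread-safe: use one
	// instance per thread.
	avir::CLancIR resizer;
	const int lines = resizer.resizeImage(src.data(), srcW, srcH,
		dst.data(), dstW, dstH, 4);

	return lines == dstH ? 0 : 1; // returns NewHeight, or 0 on bad arguments
}
```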
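Inside the resizer, `CSineGen` avoids a `sin()` call per filter tap by iterating the two-term recurrence `s1' = 2 * cos(d) * s1 - s2`, seeded with `sin(ph)` and `sin(ph - d)`; this follows from the angle-addition identity `sin(a + d) + sin(a - d) = 2 * sin(a) * cos(d)`. A short standalone check that the oscillator tracks the library sine:

```cpp
#include <cassert>
#include <cmath>

int main()
{
	const double d = 0.37, ph = 1.1; // arbitrary increment and phase

	double s1 = std::sin(ph);     // current value, sin(ph + 0 * d)
	double s2 = std::sin(ph - d); // previous value
	const double incr = 2.0 * std::cos(d);

	for (int n = 0; n < 1000; ++n)
	{
		assert(std::fabs(s1 - std::sin(ph + n * d)) < 1e-9);

		const double next = incr * s1 - s2; // one oscillator step
		s2 = s1;
		s1 = next;
	}

	return 0;
}
```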
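`makeFilterNorm()` uses two such oscillators to evaluate the Lanczos kernel at taps `x = t + FracDelay`: each tap is `sin(Freq * x) * sin(FreqA * x) / (x * x)`, with the limit `Freq * FreqA` at `x == 0`, and the taps are then rescaled to sum to one (DC gain of 1). A plain `std::sin` reference of the same construction; this sketch omits the zeroing of taps that fall outside `+/- Len2`, which the header handles explicitly:

```cpp
#include <cmath>
#include <vector>

std::vector<double> lanczosKernel(const double a, const double k,
	const double fracDelay)
{
	const double pi = 3.14159265358979324;
	const double normFreq = (k <= 1.0 ? 1.0 : 1.0 / k); // band limit
	const double freq = pi * normFreq;  // filter circular frequency
	const double freqA = freq / a;      // window circular frequency
	const int fl2 = (int) std::ceil(a / normFreq); // half kernel length

	std::vector<double> taps;
	double sum = 0.0;

	for (int t = -fl2; t < fl2; ++t) // KernelLen == fl2 + fl2 taps
	{
		const double x = t + fracDelay;
		const double v = (std::fabs(x) < 2.3e-13)
			? freq * freqA // limit of the expression at x == 0
			: std::sin(freq * x) * std::sin(freqA * x) / (x * x);

		taps.push_back(v);
		sum += v;
	}

	for (double& v : taps)
		v /= sum; // normalize to unit DC gain

	return taps;
}
```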
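Finally, `reallocBuf()` produces the aligned pointers that the SIMD loads rely on by over-allocating by `LANCIR_ALIGN` elements and rounding the raw address up with `(p + align - 1) & ~(align - 1)`, which is valid for any power-of-two alignment. The same trick in isolation:

```cpp
#include <cassert>
#include <cstdint>

// Round a pointer up to the next multiple of a power-of-two alignment,
// as lancir.h does for FltBuf, spv, and the filter buffers.
static float* alignUp(float* p, const uintptr_t align)
{
	return (float*) (((uintptr_t) p + align - 1) & ~(align - 1));
}

int main()
{
	alignas(64) float storage[64];

	float* unaligned = storage + 1;          // deliberately misaligned
	float* aligned = alignUp(unaligned, 32); // 32 bytes, as with LANCIR_AVX

	assert(((uintptr_t) aligned & 31) == 0);
	assert(aligned >= unaligned && aligned < storage + 64);
	return 0;
}
```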