diff --git a/src/Graphics/OpenGLContext/GLSL/glsl_CombinerProgramUniformFactoryAccurate.cpp b/src/Graphics/OpenGLContext/GLSL/glsl_CombinerProgramUniformFactoryAccurate.cpp
index f5bf4a49e..31600ddc0 100644
--- a/src/Graphics/OpenGLContext/GLSL/glsl_CombinerProgramUniformFactoryAccurate.cpp
+++ b/src/Graphics/OpenGLContext/GLSL/glsl_CombinerProgramUniformFactoryAccurate.cpp
@@ -111,7 +111,7 @@ class UMipmap : public UniformGroup
 		maxTile = std::min(gSP.texture.level, 1u); // Hack for HD textures
 		uMaxTile.set(maxTile, _force);
 
-		bool bNoAtlasTex = (_pTexture != nullptr && _pTexture->bHDTexture) ||
+		bool bNoAtlasTex = (_pTexture != nullptr && _pTexture->bHDTexture && _pTexture->max_level == 0) ||
 			maxTile == 0 ||
 			gDP.otherMode.textureLOD != G_TL_LOD ||
 			(gDP.otherMode.textureDetail != G_TD_DETAIL && maxTile == 1);
diff --git a/src/GraphicsDrawer.h b/src/GraphicsDrawer.h
index 5f0c307c5..cd1d24ff6 100644
--- a/src/GraphicsDrawer.h
+++ b/src/GraphicsDrawer.h
@@ -21,7 +21,7 @@ struct FrameBuffer;
 #define ELEMBUFF_SIZE 1024U
 
 constexpr f32 SCREEN_SIZE_DIM = 640.0f;
-constexpr u32 MIPMAP_TILE_WIDTH = 256u;
+constexpr u32 MIPMAP_TILE_WIDTH = 512u;
 
 enum class DrawingState
 {
diff --git a/src/Textures.cpp b/src/Textures.cpp
index 625b21e94..9b308be38 100644
--- a/src/Textures.cpp
+++ b/src/Textures.cpp
@@ -3,6 +3,7 @@
 #include <algorithm>
 #include <thread>   // std::this_thread::sleep_for
 #include <chrono>   // std::chrono::seconds
+#include "inc/lancir.h"
 #include "Platform.h"
 #include "Textures.h"
 #include "GBI.h"
@@ -1151,6 +1152,80 @@ void TextureCache::_loadBackground(CachedTexture *pTexture)
 	free(pDest);
 }
 
+void _loadHiresTextureMipMapAccurate(CachedTexture *_pTexture, GHQTexInfo const& _ghqTexInfo, u64 _ricecrc)
+{
+	u32 texWidth = _ghqTexInfo.width;
+	u32 texHeight = _ghqTexInfo.height;
+	unsigned int totalTexSize = std::max(static_cast<u32>(texWidth * texHeight + 16), MIPMAP_TILE_WIDTH)
+		* (_pTexture->max_level + 1);
+
+	std::vector<u32> m_tempTextureHolder(totalTexSize);
+	std::vector<u32> tileData(texWidth * texHeight / 4);
+	u32* pTileData = reinterpret_cast<u32*>(_ghqTexInfo.data);
+
+	u32 mipLevel = 0;
+	u32 texDataOffset = 16; // directory: number of gDP.tiles (8) * 2 words
+	avir::CLancIR imageResizer;
+
+	// Load all tiles into one 1D texture atlas.
+ u32 mipRatioS = gDP.tiles[gSP.texture.tile + 1].shifts + 5u; + if (mipRatioS >= 16u) mipRatioS -= 16u; + u32 mipRatioT = gDP.tiles[gSP.texture.tile + 1].shiftt + 5u; + if (mipRatioT >= 16) mipRatioT -= 16u; + while (true) + { + const u32 tileSizePacked = texWidth | (mipRatioT << 16) | (mipRatioS << 24); + m_tempTextureHolder[mipLevel * 2] = texDataOffset; + m_tempTextureHolder[mipLevel * 2 + 1] = tileSizePacked; + + txfilter_dmptx((u8*)pTileData, texWidth, texHeight, + texWidth, (u16)_ghqTexInfo.format, + N64FormatSize(_pTexture->format, _pTexture->size), + _ricecrc + mipLevel); + + std::copy_n(pTileData, texWidth * texHeight, &m_tempTextureHolder[texDataOffset]); + pTileData = &m_tempTextureHolder[texDataOffset]; + texDataOffset += texWidth * texHeight; + if (mipLevel == _pTexture->max_level) + break; + + ++mipLevel; + u32 mipRatioSNew = gDP.tiles[gSP.texture.tile + mipLevel + 1].shifts + 5u; + if (mipRatioSNew >= 16u) mipRatioSNew -= 16u; + u32 mipRatioTNew = gDP.tiles[gSP.texture.tile + mipLevel + 1].shiftt + 5u; + if (mipRatioTNew >= 16u) mipRatioTNew -= 16u; + u32 shifts = mipRatioSNew - mipRatioS; + u32 shiftt = mipRatioTNew - mipRatioT; + if (shifts > 0 || shiftt > 0) { + imageResizer.resizeImage((u8*)pTileData, texWidth, texHeight, (u8*)tileData.data(), texWidth >> shifts, texHeight >> shiftt, 4); + texWidth >>= shifts; + texHeight >>= shiftt; + mipRatioS = mipRatioSNew; + mipRatioT = mipRatioTNew; + pTileData = tileData.data(); + } + } + + u32 texformat = gfxContext.convertInternalTextureFormat(_ghqTexInfo.format); + Context::InitTextureParams params; + params.handle = _pTexture->name; + params.textureUnitIndex = textureIndices::Tex[1]; + params.mipMapLevel = 0; + params.mipMapLevels = 1; + params.msaaLevel = 0; + params.width = std::min(texDataOffset, MIPMAP_TILE_WIDTH); + params.height = (texDataOffset / MIPMAP_TILE_WIDTH) + ((texDataOffset % MIPMAP_TILE_WIDTH) ? 1 : 0); + params.internalFormat = InternalColorFormatParam(texformat); + params.format = ColorFormatParam(_ghqTexInfo.texture_format); + params.dataType = DatatypeParam(_ghqTexInfo.pixel_type); + params.data = m_tempTextureHolder.data(); + gfxContext.init2DTexture(params); + assert(!gfxContext.isError()); + _pTexture->mipmapAtlasWidth = params.width; + _pTexture->mipmapAtlasHeight = params.height; + _pTexture->textureBytes = texDataOffset << 2; +} + bool TextureCache::_loadHiresTexture(u32 _tile, CachedTexture *_pTexture, u64 & _ricecrc, u64 & _strongcrc) { if (config.textureFilter.txHiresEnable == 0 || !TFH.isInited()) @@ -1233,6 +1308,21 @@ bool TextureCache::_loadHiresTexture(u32 _tile, CachedTexture *_pTexture, u64 & hirestexFound = txfilter_hirestex(_pTexture->crc, _strongcrc, palette, N64FormatSize(_pTexture->format, _pTexture->size), &ghqTexInfo); } if (hirestexFound && ghqTexInfo.width != 0 && ghqTexInfo.height != 0) { + if (config.generalEmulation.enableInaccurateTextureCoordinates == 0 && + _tile > 0 && + currentCombiner()->usesLOD() && + gSP.texture.level > 1) { + _pTexture->max_level = gDP.otherMode.textureDetail == G_TD_DETAIL ? 
+				static_cast<u8>(gSP.texture.level) :
+				static_cast<u8>(gSP.texture.level - 1);
+		}
+		if (_pTexture->max_level > 0)
+		{
+			_loadHiresTextureMipMapAccurate(_pTexture, ghqTexInfo, _ricecrc);
+			_updateCachedTexture(ghqTexInfo, _pTexture, width, height);
+			return true;
+		}
+
 		ghqTexInfo.format = gfxContext.convertInternalTextureFormat(ghqTexInfo.format);
 		Context::InitTextureParams params;
 		params.handle = _pTexture->name;
@@ -1666,7 +1756,6 @@ void TextureCache::_loadAccurate(u32 _tile, CachedTexture *_pTexture)
 
 	// Load all tiles into one 1D texture atlas.
 	while (true)
 	{
-		u32 mipRatioS = gDP.tiles[gSP.texture.tile + mipLevel + 1].shifts + 5u;
 		if (mipRatioS >= 16u) mipRatioS -= 16u;
 		u32 mipRatioT = gDP.tiles[gSP.texture.tile + mipLevel + 1].shiftt + 5u;
@@ -1699,7 +1788,6 @@ void TextureCache::_loadAccurate(u32 _tile, CachedTexture *_pTexture)
 		++mipLevel;
 		const u32 tileMipLevel = gSP.texture.tile + mipLevel + 1;
 		gDPTile & mipTile = gDP.tiles[tileMipLevel];
-		gDPTile & prevMipTile = gDP.tiles[tileMipLevel - 1];
 		line = mipTile.line;
 		tmptex.tMem = mipTile.tmem;
 		tmptex.palette = mipTile.palette;
diff --git a/src/inc/lancir.h b/src/inc/lancir.h
new file mode 100644
index 000000000..12e5c8207
--- /dev/null
+++ b/src/inc/lancir.h
@@ -0,0 +1,2446 @@
+//$ nobt
+//$ nocpp
+
+/**
+ * @file lancir.h
+ *
+ * @version 3.0.11
+ *
+ * @brief The self-contained header-only "LANCIR" image resizing algorithm.
+ *
+ * This is the self-contained inclusion file for the "LANCIR" image resizer,
+ * a part of the AVIR library. Features scalar, AVX, SSE2, and NEON
+ * optimizations as well as batched resizing technique which provides a better
+ * CPU cache performance.
+ *
+ * AVIR Copyright (c) 2015-2024 Aleksey Vaneev
+ *
+ * @mainpage
+ *
+ * @section intro_sec Introduction
+ *
+ * Description is available at https://github.com/avaneev/avir
+ *
+ * @section license License
+ *
+ * LICENSE:
+ *
+ * Copyright (c) 2015-2024 Aleksey Vaneev
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef AVIR_CLANCIR_INCLUDED
+#define AVIR_CLANCIR_INCLUDED
+
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+
+#if defined( __AVX__ ) || defined( __AVX2__ )
+
+	#include <immintrin.h>
+
+	#define LANCIR_AVX
+	#define LANCIR_SSE2 // Some functions use SSE2; AVX has a higher priority.
+	#define LANCIR_ALIGN 32
+
+#elif defined( __SSE4_2__ ) || defined( __SSE4_1__ ) || \
+	defined( __SSSE3__ ) || defined( __SSE3__ ) || defined( __SSE2__ ) || \
+	defined( __x86_64__ ) || defined( __amd64 ) || defined( _M_X64 ) || \
+	defined( _M_AMD64 ) || ( defined( _M_IX86_FP ) && _M_IX86_FP == 2 )
+
+	#if defined( _MSC_VER )
+		#include <intrin.h>
+	#else // defined( _MSC_VER )
+		#include <emmintrin.h>
+	#endif // defined( _MSC_VER )
+
+	#define LANCIR_SSE2
+	#define LANCIR_ALIGN 16
+
+#elif defined( __aarch64__ ) || defined( __arm64 ) || defined( __ARM_NEON )
+
+	#include <arm_neon.h>
+
+	#define LANCIR_NEON
+	#define LANCIR_ALIGN 16
+
+#else // NEON
+
+	#define LANCIR_ALIGN 4
+
+#endif // NEON
+
+namespace avir {
+
+/**
+ * @brief LANCIR resizing parameters class.
+ *
+ * An object of this class, which can be allocated on stack, can be used to
+ * pass non-default parameters to the resizing algorithm. See the constructor
+ * for the default values.
+ */
+
+class CLancIRParams
+{
+public:
+	int SrcSSize; ///< Physical size of the source scanline, in elements (not
+		///< bytes). If this value is below 1, SrcWidth * ElCount will be
+		///< used.
+	int NewSSize; ///< Physical size of the destination scanline, in elements
+		///< (not bytes). If this value is below 1, NewWidth * ElCount will be
+		///< used.
+	double kx; ///< Resizing step - horizontal (one output pixel corresponds
+		///< to `k` input pixels). A downsizing factor if greater than 1.0;
+		///< upsizing factor if below or equal to 1.0. Multiply by -1 if you
+		///< would like to bypass `ox` and `oy` adjustment which is done by
+		///< default to produce a centered image. If this step value equals 0,
+		///< the step value will be chosen automatically.
+	double ky; ///< Resizing step - vertical. Same as `kx`.
+	double ox; ///< Start X pixel offset within the source image, can be
+		///< negative. A positive offset moves the image to the left.
+		///<
+	double oy; ///< Start Y pixel offset within the source image, can be
+		///< negative. A positive offset moves the image to the top.
+		///<
+	double la; ///< Lanczos window function's `a` parameter, greater or equal
+		///< to 2.0.
+		///<
+
+	/**
+	 * Default constructor, with optional arguments that correspond to class
+	 * variables.
+	 *
+	 * @param aSrcSSize Physical size of the source scanline.
+	 * @param aNewSSize Physical size of the destination scanline.
+	 * @param akx Resizing step - horizontal.
+	 * @param aky Resizing step - vertical.
+	 * @param aox Start X pixel offset.
+	 * @param aoy Start Y pixel offset.
+	 */
+
+	CLancIRParams( const int aSrcSSize = 0, const int aNewSSize = 0,
+		const double akx = 0.0, const double aky = 0.0,
+		const double aox = 0.0, const double aoy = 0.0 )
+		: SrcSSize( aSrcSSize )
+		, NewSSize( aNewSSize )
+		, kx( akx )
+		, ky( aky )
+		, ox( aox )
+		, oy( aoy )
+		, la( 3.0 )
+	{
+	}
+};
+
+/**
+ * @brief LANCIR image resizer class.
+ *
+ * The object of this class can be used to resize 1-4 channel images to any
+ * required size.
Resizing is performed by utilizing Lanczos filters, with + * 8-bit precision. This class offers a kind of "optimal" Lanczos resampling + * implementation. + * + * Object of this class can be allocated on stack. + * + * Note that object of this class does not free temporary buffers and + * variables after the resizeImage() function call (until object's + * destruction): these buffers are reused (or reallocated) on subsequent + * calls, thus making batch resizing of images faster. This means resizing is + * not thread-safe: a separate object should be created for each thread. + */ + +class CLancIR +{ +private: + CLancIR( const CLancIR& ) + { + // Unsupported. + } + + CLancIR& operator = ( const CLancIR& ) + { + // Unsupported. + return( *this ); + } + +public: + CLancIR() + : FltBuf0( NULL ) + , FltBuf0Len( 0 ) + , spv0( NULL ) + , spv0len( 0 ) + , spv( NULL ) + { + } + + ~CLancIR() + { + delete[] FltBuf0; + delete[] spv0; + } + + /** + * @brief Function resizes an image. + * + * Performs input-to-output type conversion, if necessary. + * + * @param[in] SrcBuf Source image buffer. + * @param SrcWidth Source image width, in pixels. + * @param SrcHeight Source image height, in pixels. + * @param[out] NewBuf Buffer to accept the resized image. Cannot be equal + * to SrcBuf. + * @param NewWidth New image width, in pixels. + * @param NewHeight New image height, in pixels. + * @param ElCount The number of elements (channels) used to store each + * source and destination pixel (1-4). + * @tparam Tin Input buffer's element type. Can be uint8_t (0-255 value + * range), uint16_t (0-65535 value range), float (0-1 value range), double + * (0-1 value range). uint32_t type is treated as uint16_t. Signed integer + * types and larger integer types are unsupported. + * @tparam Tout Output buffer's element type, treated like `Tin`. If `Tin` + * and `Tout` types do not match, an output value scaling will be applied. + * Floating-point output will not clamped/clipped/saturated, integer + * output is always rounded and clamped. + * @return The number of available output scanlines. Equals to NewHeight, + * or 0 on function parameters error. + */ + + template< typename Tin, typename Tout > + int resizeImage( const Tin* const SrcBuf, const int SrcWidth, + const int SrcHeight, Tout* const NewBuf, const int NewWidth, + const int NewHeight, const int ElCount, + const CLancIRParams* const aParams = NULL ) + { + if(( SrcWidth < 0 ) | ( SrcHeight < 0 ) | ( NewWidth <= 0 ) | + ( NewHeight <= 0 ) | ( SrcBuf == NULL ) | ( NewBuf == NULL ) | + ( (const void*) SrcBuf == (const void*) NewBuf )) + { + return( 0 ); + } + + static const CLancIRParams DefParams; + const CLancIRParams& Params = ( aParams != NULL ? + *aParams : DefParams ); + + if( Params.la < 2.0 ) + { + return( 0 ); + } + + const int OutSSize = NewWidth * ElCount; + const size_t NewScanlineSize = ( Params.NewSSize < 1 ? + OutSSize : Params.NewSSize ); + + if(( SrcWidth == 0 ) | ( SrcHeight == 0 )) + { + Tout* op = NewBuf; + int i; + + for( i = 0; i < NewHeight; i++ ) + { + memset( op, 0, OutSSize * sizeof( Tout )); + op += NewScanlineSize; + } + + return( NewHeight ); + } + + const size_t SrcScanlineSize = ( Params.SrcSSize < 1 ? + SrcWidth * ElCount : Params.SrcSSize ); + + double ox = Params.ox; + double oy = Params.oy; + double kx; + double ky; + + if( Params.kx >= 0.0 ) + { + kx = ( Params.kx == 0.0 ? 
+ (double) SrcWidth / NewWidth : Params.kx ); + + ox += ( kx - 1.0 ) * 0.5; + } + else + { + kx = -Params.kx; + } + + if( Params.ky >= 0.0 ) + { + ky = ( Params.ky == 0.0 ? + (double) SrcHeight / NewHeight : Params.ky ); + + oy += ( ky - 1.0 ) * 0.5; + } + else + { + ky = -Params.ky; + } + + if( rfv.update( Params.la, ky, ElCount )) + { + rsv.reset(); + rsh.reset(); + } + + CResizeFilters* rfh; // Pointer to resizing filters for horizontal + // resizing, may equal to `rfv` if the same stepping is in use. + + if( kx == ky ) + { + rfh = &rfv; + } + else + { + rfh = &rfh0; + + if( rfh0.update( Params.la, kx, ElCount )) + { + rsh.reset(); + } + } + + rsv.update( SrcHeight, NewHeight, oy, rfv, spv ); + rsh.update( SrcWidth, NewWidth, ox, *rfh ); + + // Calculate vertical progressive resizing's batch size. Progressive + // batching is used to try to keep addressing within the cache + // capacity. This technique definitely works well for single-threaded + // resizing on most CPUs, but may not provide an additional benefit + // for multi-threaded resizing, or in a system-wide high-load + // situations. + + const size_t FltWidthE = ( rsh.padl + SrcWidth + rsh.padr ) * ElCount; + const double CacheSize = 5500000.0; // Tuned for various CPUs. + const double OpSize = (double) SrcScanlineSize * SrcHeight * + sizeof( Tin ) + (double) FltWidthE * NewHeight * sizeof( float ); + + int BatchSize = (int) ( NewHeight * CacheSize / ( OpSize + 1.0 )); + + if( BatchSize < 8 ) + { + BatchSize = 8; + } + + if( BatchSize > NewHeight ) + { + BatchSize = NewHeight; + } + + // Allocate/resize intermediate buffers. + + const int svs = ( rsv.padl + SrcHeight + rsv.padr ) * ElCount; + float* const pspv0 = spv0; + reallocBuf( spv0, spv, spv0len, ( svs > OutSSize ? svs : OutSSize )); + reallocBuf( FltBuf0, FltBuf, FltBuf0Len, FltWidthE * BatchSize ); + + if( spv0 != pspv0 ) + { + rsv.updateSPO( rfv, spv ); + } + + // Prepare output-related constants. + + const bool IsOutFloat = ( (Tout) 0.25 != 0 ); + const int Clamp = ( sizeof( Tout ) == 1 ? 255 : 65535 ); + const float OutMul = ( IsOutFloat ? 1.0f : (float) Clamp ) / + ( (Tin) 0.25 != 0 ? 1 : ( sizeof( Tin ) == 1 ? 255 : 65535 )); + + // Perform batched resizing. + + const CResizePos* rpv = rsv.pos; + Tout* opn = NewBuf; + int bl = NewHeight; + + while( bl > 0 ) + { + const int bc = ( bl > BatchSize ? BatchSize : bl ); + + int kl = rfv.KernelLen; + const Tin* ip = SrcBuf; + float* op = FltBuf + rsh.padl * ElCount; + + const int so = (int) rpv[ 0 ].so; + float* const sp = spv + so * ElCount; + + int cc = (int) rpv[ bc - 1 ].so - so + kl; // Pixel copy count. + int rl = 0; // Leftmost pixel's replication count. + int rr = 0; // Rightmost pixel's replication count. + + const int socc = so + cc; + const int spe = rsv.padl + SrcHeight; + + // Calculate scanline copying and padding parameters, depending on + // the batch's size and its vertical offset. + + if( so < rsv.padl ) + { + if( socc <= rsv.padl ) + { + rl = cc; + cc = 0; + } + else + { + if( socc > spe ) + { + rr = socc - spe; + cc -= rr; + } + + rl = rsv.padl - so; + cc -= rl; + } + } + else + { + if( so >= spe ) + { + rr = cc; + cc = 0; + ip += SrcHeight * SrcScanlineSize; + } + else + { + if( socc > spe ) + { + rr = socc - spe; + cc -= rr; + } + + ip += ( so - rsv.padl ) * SrcScanlineSize; + } + } + + // Batched vertical resizing. 
+ + int i; + + if( ElCount == 1 ) + { + for( i = 0; i < SrcWidth; i++ ) + { + copyScanline1v( ip, SrcScanlineSize, sp, cc, rl, rr ); + resize1< false >( NULL, op, FltWidthE, rpv, kl, bc ); + ip += 1; + op += 1; + } + } + else + if( ElCount == 2 ) + { + for( i = 0; i < SrcWidth; i++ ) + { + copyScanline2v( ip, SrcScanlineSize, sp, cc, rl, rr ); + resize2< false >( NULL, op, FltWidthE, rpv, kl, bc ); + ip += 2; + op += 2; + } + } + else + if( ElCount == 3 ) + { + for( i = 0; i < SrcWidth; i++ ) + { + copyScanline3v( ip, SrcScanlineSize, sp, cc, rl, rr ); + resize3< false >( NULL, op, FltWidthE, rpv, kl, bc ); + ip += 3; + op += 3; + } + } + else // ElCount == 4 + { + for( i = 0; i < SrcWidth; i++ ) + { + copyScanline4v( ip, SrcScanlineSize, sp, cc, rl, rr ); + resize4< false >( NULL, op, FltWidthE, rpv, kl, bc ); + ip += 4; + op += 4; + } + } + + // Perform horizontal resizing batch, and produce final output. + + float* ipf = FltBuf; + kl = rfh -> KernelLen; + + if( ElCount == 1 ) + { + for( i = 0; i < bc; i++ ) + { + padScanline1h( ipf, rsh, SrcWidth ); + resize1< true >( ipf, spv, 1, rsh.pos, kl, NewWidth ); + copyToOutput( spv, opn, OutSSize, Clamp, IsOutFloat, + OutMul ); + + ipf += FltWidthE; + opn += NewScanlineSize; + } + } + else + if( ElCount == 2 ) + { + for( i = 0; i < bc; i++ ) + { + padScanline2h( ipf, rsh, SrcWidth ); + resize2< true >( ipf, spv, 2, rsh.pos, kl, NewWidth ); + copyToOutput( spv, opn, OutSSize, Clamp, IsOutFloat, + OutMul ); + + ipf += FltWidthE; + opn += NewScanlineSize; + } + } + else + if( ElCount == 3 ) + { + for( i = 0; i < bc; i++ ) + { + padScanline3h( ipf, rsh, SrcWidth ); + resize3< true >( ipf, spv, 3, rsh.pos, kl, NewWidth ); + copyToOutput( spv, opn, OutSSize, Clamp, IsOutFloat, + OutMul ); + + ipf += FltWidthE; + opn += NewScanlineSize; + } + } + else // ElCount == 4 + { + for( i = 0; i < bc; i++ ) + { + padScanline4h( ipf, rsh, SrcWidth ); + resize4< true >( ipf, spv, 4, rsh.pos, kl, NewWidth ); + copyToOutput( spv, opn, OutSSize, Clamp, IsOutFloat, + OutMul ); + + ipf += FltWidthE; + opn += NewScanlineSize; + } + } + + rpv += bc; + bl -= bc; + } + + return( NewHeight ); + } + + /** + * @brief Legacy resizing function. Not recommended for new projects. + * + * See the prior resizeImage() function and CLancIRParams class for + * details. + * + * @param[in] SrcBuf Source image buffer. + * @param SrcWidth Source image width, in pixels. + * @param SrcHeight Source image height, in pixels. + * @param SrcSSize Physical size of the source scanline, in elements (not + * bytes). + * @param[out] NewBuf Buffer to accept the resized image. Cannot be equal + * to SrcBuf. + * @param NewWidth New image width, in pixels. + * @param NewHeight New image height, in pixels. + * @param NewSSize Physical size of the destination scanline, in elements + * (not bytes). + * @param ElCount The number of elements (channels) used to store each + * source and destination pixel (1-4). + * @param kx0 Resizing step - horizontal. + * @param ky0 Resizing step - vertical. Same as `kx0`. + * @param ox Start X pixel offset within the source image. + * @param oy Start Y pixel offset within the source image. + * @tparam Tin Input buffer's element type. + * @tparam Tout Output buffer's element type. + * @return The number of available output scanlines. Equals to NewHeight, + * or 0 on function parameters error. 
+ */ + + template< typename Tin, typename Tout > + int resizeImage( const Tin* const SrcBuf, const int SrcWidth, + const int SrcHeight, const int SrcSSize, Tout* const NewBuf, + const int NewWidth, const int NewHeight, const int NewSSize, + const int ElCount, const double kx0 = 0.0, const double ky0 = 0.0, + double ox = 0.0, double oy = 0.0 ) + { + const CLancIRParams Params( SrcSSize, NewSSize, kx0, ky0, ox, oy ); + + return( resizeImage( SrcBuf, SrcWidth, SrcHeight, NewBuf, NewWidth, + NewHeight, ElCount, &Params )); + } + +protected: + float* FltBuf0; ///< Intermediate resizing buffer. + size_t FltBuf0Len; ///< Length of `FltBuf0`. + float* FltBuf; ///< Address-aligned `FltBuf0`. + float* spv0; ///< Scanline buffer for vertical resizing, also used at the + ///< output stage. + ///< + int spv0len; ///< Length of `spv0`. + float* spv; ///< Address-aligned `spv0`. + + /** + * Function reallocates a typed buffer if its current length is smaller + * than the required length. + * + * @param buf0 Reference to the pointer of the previously allocated + * buffer. + * @param buf Reference to address-aligned `buf0` pointer. + * @param len The current length of the `buf0`. + * @param newlen A new required length. + * @tparam Tb Buffer element type. + * @tparam Tl Length variable type. + */ + + template< typename Tb, typename Tl > + static void reallocBuf( Tb*& buf0, Tb*& buf, Tl& len, Tl newlen ) + { + newlen += LANCIR_ALIGN; + + if( newlen > len ) + { + if( buf0 != NULL ) + { + delete[] buf0; + buf0 = NULL; + len = 0; + } + + buf0 = new Tb[ newlen ]; + len = newlen; + buf = (Tb*) (( (uintptr_t) buf0 + LANCIR_ALIGN - 1 ) & + ~(uintptr_t) ( LANCIR_ALIGN - 1 )); + } + } + + /** + * Function reallocates a typed buffer if its current length is smaller + * than the required length. + * + * @param buf Reference to the pointer of the previously allocated buffer; + * address alignment will not be applied. + * @param len The current length of the `buf0`. + * @param newlen A new required length. + * @tparam Tb Buffer element type. + * @tparam Tl Length variable type. + */ + + template< typename Tb, typename Tl > + static void reallocBuf( Tb*& buf, Tl& len, const Tl newlen ) + { + if( newlen > len ) + { + if( buf != NULL ) + { + delete[] buf; + buf = NULL; + len = 0; + } + + buf = new Tb[ newlen ]; + len = newlen; + } + } + + class CResizeScanline; + + /** + * Class implements fractional delay filter bank calculation. + */ + + class CResizeFilters + { + friend class CResizeScanline; + + public: + int KernelLen; ///< Resampling filter kernel's length, taps. Available + ///< after the update() function call. Always an even value, + ///< should not be lesser than 4. + + CResizeFilters() + : Filters( NULL ) + , FiltersLen( 0 ) + , la( 0.0 ) + { + memset( Bufs0, 0, sizeof( Bufs0 )); + memset( Bufs0Len, 0, sizeof( Bufs0Len )); + } + + ~CResizeFilters() + { + int i; + + for( i = 0; i < BufCount; i++ ) + { + delete[] Bufs0[ i ]; + } + + delete[] Filters; + } + + /** + * Function updates the resizing filter bank. + * + * @param la0 Lanczos `a` parameter value (greater or equal to 2.0), + * can be fractional. + * @param k0 Resizing step. + * @param ElCount0 Image's element count, may be used for SIMD filter + * tap replication. + * @return `true` if an update occured and scanline resizing positions + * should be updated unconditionally. 
+ */ + + bool update( const double la0, const double k0, const int ElCount0 ) + { + if( la0 == la && k0 == k && ElCount0 == ElCount ) + { + return( false ); + } + + const double NormFreq = ( k0 <= 1.0 ? 1.0 : 1.0 / k0 ); + Freq = 3.1415926535897932 * NormFreq; + FreqA = Freq / la0; + + Len2 = la0 / NormFreq; + fl2 = (int) ceil( Len2 ); + KernelLen = fl2 + fl2; + + #if LANCIR_ALIGN > 4 + + ElRepl = ElCount0; + KernelLenA = KernelLen * ElRepl; + + const int elalign = + (int) ( LANCIR_ALIGN / sizeof( float )) - 1; + + KernelLenA = ( KernelLenA + elalign ) & ~elalign; + + #else // LANCIR_ALIGN > 4 + + ElRepl = 1; + KernelLenA = KernelLen; + + #endif // LANCIR_ALIGN > 4 + + FracCount = 1000; // Enough for Lanczos implicit 8-bit precision. + + la = 0.0; + reallocBuf( Filters, FiltersLen, FracCount + 1 ); + + memset( Filters, 0, FiltersLen * sizeof( Filters[ 0 ])); + + setBuf( 0 ); + + la = la0; + k = k0; + ElCount = ElCount0; + + return( true ); + } + + /** + * Function returns filter at the specified fractional offset. This + * function can only be called after a prior update() function call. + * + * @param x Fractional offset, [0; 1]. + */ + + const float* getFilter( const double x ) + { + const int Frac = (int) ( x * FracCount + 0.5 ); + float* flt = Filters[ Frac ]; + + if( flt != NULL ) + { + return( flt ); + } + + flt = Bufs[ CurBuf ] + CurBufFill * KernelLenA; + Filters[ Frac ] = flt; + CurBufFill++; + + if( CurBufFill == BufLen ) + { + setBuf( CurBuf + 1 ); + } + + makeFilterNorm( flt, 1.0 - (double) Frac / FracCount ); + + if( ElRepl > 1 ) + { + replicateFilter( flt, KernelLen, ElRepl ); + } + + return( flt ); + } + + protected: + double Freq; ///< Circular frequency of the filter. + double FreqA; ///< Circular frequency of the window function. + double Len2; ///< Half resampling filter's length, unrounded. + int fl2; ///< Half resampling filter's length, integer. + int FracCount; ///< The number of fractional positions for which + ///< filters can be created. + ///< + int KernelLenA; ///< SIMD-aligned and replicated filter kernel's + ///< length. + ///< + int ElRepl; ///< The number of repetitions of each filter tap. + static const int BufCount = 4; ///< The maximal number of buffers that + ///< can be in use. + ///< + static const int BufLen = 256; ///< The number of fractional filters + ///< a single buffer may contain. Both `BufLen` and `BufCount` + ///< should correspond to the `FracCount` used. + float* Bufs0[ BufCount ]; ///< Buffers that hold all filters, + ///< original. + ///< + int Bufs0Len[ BufCount ]; ///< Allocated lengthes in `Bufs0`, in + ///< `float` elements. + ///< + float* Bufs[ BufCount ]; ///< Address-aligned `Bufs0`. + int CurBuf; ///< Filter buffer currently being filled. + int CurBufFill; ///< The number of fractional positions filled in the + ///< current filter buffer. + ///< + float** Filters; ///< Fractional delay filters for all positions. + ///< A particular pointer equals NULL if a filter for such + ///< position has not been created yet. + int FiltersLen; ///< Allocated length of Filters, in elements. + double la; ///< Current `la`. + double k; ///< Current `k`. + int ElCount; ///< Current `ElCount`. + + /** + * Function changes the buffer currently being filled, check its + * size and reallocates it if necessary, then resets its fill counter. + * + * @param bi New current buffer index. 
+ */ + + void setBuf( const int bi ) + { + reallocBuf( Bufs0[ bi ], Bufs[ bi ], Bufs0Len[ bi ], + BufLen * KernelLenA ); + + CurBuf = bi; + CurBufFill = 0; + } + + /** + * @brief Sine-wave signal generator class. + * + * Class implements sine-wave signal generator without biasing, with + * constructor-based initialization only. This generator uses an + * oscillator instead of the `sin` function. + */ + + class CSineGen + { + public: + /** + * Constructor initializes `this` sine-wave signal generator. + * + * @param si Sine function increment, in radians. + * @param ph Starting phase, in radians. Add `0.5 x PI` for a + * cosine function. + */ + + CSineGen( const double si, const double ph ) + : svalue1( sin( ph )) + , svalue2( sin( ph - si )) + , sincr( 2.0 * cos( si )) + { + } + + /** + * @return The next value of the sine-wave, without biasing. + */ + + double generate() + { + const double res = svalue1; + + svalue1 = sincr * res - svalue2; + svalue2 = res; + + return( res ); + } + + private: + double svalue1; ///< Current sine value. + double svalue2; ///< Previous sine value. + double sincr; ///< Sine value increment. + }; + + /** + * Function creates a filter for the specified fractional delay. The + * update() function should be called prior to calling this function. + * The created filter is normalized (DC gain=1). + * + * @param[out] op Output filter buffer. + * @param FracDelay Fractional delay, 0 to 1, inclusive. + */ + + void makeFilterNorm( float* op, const double FracDelay ) const + { + CSineGen f( Freq, Freq * ( FracDelay - fl2 )); + CSineGen fw( FreqA, FreqA * ( FracDelay - fl2 )); + + float* op0 = op; + double s = 0.0; + double ut; + + int t = -fl2; + + if( t + FracDelay < -Len2 ) + { + f.generate(); + fw.generate(); + *op = (float) 0; + op++; + t++; + } + + int IsZeroX = ( fabs( FracDelay - 1.0 ) < 2.3e-13 ); + int mt = 0 - IsZeroX; + IsZeroX |= ( fabs( FracDelay ) < 2.3e-13 ); + + while( t < mt ) + { + ut = t + FracDelay; + *op = (float) ( f.generate() * fw.generate() / ( ut * ut )); + s += *op; + op++; + t++; + } + + if( IsZeroX ) // t+FracDelay==0 + { + *op = (float) ( Freq * FreqA ); + s += *op; + f.generate(); + fw.generate(); + } + else + { + ut = FracDelay; // t==0 + *op = (float) ( f.generate() * fw.generate() / ( ut * ut )); + s += *op; + } + + mt = fl2 - 2; + + while( t < mt ) + { + op++; + t++; + ut = t + FracDelay; + *op = (float) ( f.generate() * fw.generate() / ( ut * ut )); + s += *op; + } + + op++; + ut = t + 1 + FracDelay; + + if( ut > Len2 ) + { + *op = (float) 0; + } + else + { + *op = (float) ( f.generate() * fw.generate() / ( ut * ut )); + s += *op; + } + + s = 1.0 / s; + t = (int) ( op - op0 + 1 ); + + while( t != 0 ) + { + *op0 = (float) ( *op0 * s ); + op0++; + t--; + } + } + + /** + * Function replicates taps of the specified filter so that it can + * be used with SIMD loading instructions. This function works + * "in-place". + * + * @param[in,out] p Filter buffer pointer, should be sized to contain + * `kl * erp` elements. + * @param kl Filter kernel's length, in taps. + * @param erp The number of repetitions to apply. 
+ */ + + static void replicateFilter( float* const p, const int kl, + const int erp ) + { + const float* ip = p + kl - 1; + float* op = p + ( kl - 1 ) * erp; + int c = kl; + + if( erp == 2 ) + { + while( c != 0 ) + { + const float v = *ip; + op[ 0 ] = v; + op[ 1 ] = v; + ip--; + op -= 2; + c--; + } + } + else + if( erp == 3 ) + { + while( c != 0 ) + { + const float v = *ip; + op[ 0 ] = v; + op[ 1 ] = v; + op[ 2 ] = v; + ip--; + op -= 3; + c--; + } + } + else // erp == 4 + { + while( c != 0 ) + { + const float v = *ip; + op[ 0 ] = v; + op[ 1 ] = v; + op[ 2 ] = v; + op[ 3 ] = v; + ip--; + op -= 4; + c--; + } + } + } + }; + + /** + * Structure defines source scanline positions and filters for each + * destination pixel. + */ + + struct CResizePos + { + const float* flt; ///< Fractional delay filter. + intptr_t spo; ///< Source scanline's pixel offset, in bytes, or + ///< a direct pointer to scanline buffer. + ///< + intptr_t so; ///< Offset within the source scanline, in pixels. + }; + + /** + * Class contains resizing positions, and prepares source scanline + * positions for resize filtering. The public variables become available + * after the update() function call. + */ + + class CResizeScanline + { + public: + int padl; ///< Left-padding (in pixels) required for source scanline. + int padr; ///< Right-padding (in pixels) required for source scanline. + CResizePos* pos; ///< Source scanline positions (offsets) and filters + ///< for each destination pixel position. + ///< + + CResizeScanline() + : pos( NULL ) + , poslen( 0 ) + , SrcLen( 0 ) + { + } + + ~CResizeScanline() + { + delete[] pos; + } + + /** + * Function "resets" `this` object so that the next update() call + * fully updates the position buffer. Reset is necessary if the filter + * object was updated. + */ + + void reset() + { + SrcLen = 0; + } + + /** + * Function updates resizing positions, updates `padl`, `padr`, and + * `pos` buffer. + * + * @param SrcLen0 Source image scanline length, used to create a + * scanline buffer without length pre-calculation. + * @param DstLen0 Destination image scanline length. + * @param o0 Initial source image offset. + * @param rf Resizing filters object. + * @param sp A pointer to scanline buffer, to use for absolute + * scanline positioning, can be NULL. + */ + + void update( const int SrcLen0, const int DstLen0, const double o0, + CResizeFilters& rf, float* const sp = NULL ) + { + if( SrcLen0 == SrcLen && DstLen0 == DstLen && o0 == o ) + { + return; + } + + const int fl2m1 = rf.fl2 - 1; + padl = fl2m1 - (int) floor( o0 ); + + if( padl < 0 ) + { + padl = 0; + } + + // Make sure `padr` and `pos` are in sync: calculate ending `pos` + // offset in advance. 
+ + const double k = rf.k; + + const int DstLen_m1 = DstLen0 - 1; + const double oe = o0 + k * DstLen_m1; + const int ie = (int) floor( oe ); + + padr = ie + rf.fl2 + 1 - SrcLen0; + + if( padr < 0 ) + { + padr = 0; + } + + SrcLen = 0; + reallocBuf( pos, poslen, DstLen0 ); + + const intptr_t ElCountF = rf.ElCount * sizeof( float ); + const int so = padl - fl2m1; + CResizePos* rp = pos; + intptr_t rpso; + int i; + + for( i = 0; i < DstLen_m1; i++ ) + { + const double ox = o0 + k * i; + const int ix = (int) floor( ox ); + + rp -> flt = rf.getFilter( ox - ix ); + rpso = so + ix; + rp -> spo = (intptr_t) sp + rpso * ElCountF; + rp -> so = rpso; + rp++; + } + + rp -> flt = rf.getFilter( oe - ie ); + rpso = so + ie; + rp -> spo = (intptr_t) sp + rpso * ElCountF; + rp -> so = rpso; + + SrcLen = SrcLen0; + DstLen = DstLen0; + o = o0; + } + + /** + * Function updates `pos` buffer's `spo` values. + * + * @param rf Resizing filters object. + * @param sp A pointer to scanline buffer, to use for absolute + * scanline positioning, can be NULL. + */ + + void updateSPO( CResizeFilters& rf, float* const sp ) + { + const intptr_t ElCountF = rf.ElCount * sizeof( float ); + CResizePos* const rp = pos; + int i; + + for( i = 0; i < DstLen; i++ ) + { + rp[ i ].spo = (intptr_t) sp + rp[ i ].so * ElCountF; + } + } + + protected: + int poslen; ///< Allocated `pos` buffer's length. + int SrcLen; ///< Current `SrcLen`. + int DstLen; ///< Current `DstLen`. + double o; ///< Current `o`. + }; + + CResizeFilters rfv; ///< Resizing filters for vertical resizing. + CResizeFilters rfh0; ///< Resizing filters for horizontal resizing (may + ///< not be in use). + ///< + CResizeScanline rsv; ///< Vertical resize scanline. + CResizeScanline rsh; ///< Horizontal resize scanline. + + /** + * Function copies scanline (fully or partially) from the source buffer, + * in its native format, to the internal scanline buffer, in preparation + * for vertical resizing. Variants for 1-4-channel images. + * + * @param ip Source scanline buffer pointer. + * @param ipinc `ip` increment per pixel. + * @param op Output scanline pointer. + * @param cc Source pixel copy count. + * @param repl Leftmost pixel's replication count. + * @param repr Rightmost pixel's replication count. + * @tparam T Source buffer's element type. 
+ */ + + template< typename T > + static void copyScanline1v( const T* ip, const size_t ipinc, float* op, + int cc, int repl, int repr ) + { + float v0; + + if( repl > 0 ) + { + v0 = (float) ip[ 0 ]; + + do + { + op[ 0 ] = v0; + op += 1; + + } while( --repl != 0 ); + } + + while( cc != 0 ) + { + op[ 0 ] = (float) ip[ 0 ]; + ip += ipinc; + op += 1; + cc--; + } + + if( repr > 0 ) + { + const T* const ipe = ip - ipinc; + v0 = (float) ipe[ 0 ]; + + do + { + op[ 0 ] = v0; + op += 1; + + } while( --repr != 0 ); + } + } + + template< typename T > + static void copyScanline2v( const T* ip, const size_t ipinc, float* op, + int cc, int repl, int repr ) + { + float v0, v1; + + if( repl > 0 ) + { + v0 = (float) ip[ 0 ]; + v1 = (float) ip[ 1 ]; + + do + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op += 2; + + } while( --repl != 0 ); + } + + while( cc != 0 ) + { + op[ 0 ] = (float) ip[ 0 ]; + op[ 1 ] = (float) ip[ 1 ]; + ip += ipinc; + op += 2; + cc--; + } + + if( repr > 0 ) + { + const T* const ipe = ip - ipinc; + v0 = (float) ipe[ 0 ]; + v1 = (float) ipe[ 1 ]; + + do + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op += 2; + + } while( --repr != 0 ); + } + } + + template< typename T > + static void copyScanline3v( const T* ip, const size_t ipinc, float* op, + int cc, int repl, int repr ) + { + float v0, v1, v2; + + if( repl > 0 ) + { + v0 = (float) ip[ 0 ]; + v1 = (float) ip[ 1 ]; + v2 = (float) ip[ 2 ]; + + do + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op[ 2 ] = v2; + op += 3; + + } while( --repl != 0 ); + } + + while( cc != 0 ) + { + op[ 0 ] = (float) ip[ 0 ]; + op[ 1 ] = (float) ip[ 1 ]; + op[ 2 ] = (float) ip[ 2 ]; + ip += ipinc; + op += 3; + cc--; + } + + if( repr > 0 ) + { + const T* const ipe = ip - ipinc; + v0 = (float) ipe[ 0 ]; + v1 = (float) ipe[ 1 ]; + v2 = (float) ipe[ 2 ]; + + do + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op[ 2 ] = v2; + op += 3; + + } while( --repr != 0 ); + } + } + + template< typename T > + static void copyScanline4v( const T* ip, const size_t ipinc, float* op, + int cc, int repl, int repr ) + { + float v0, v1, v2, v3; + + if( repl > 0 ) + { + v0 = (float) ip[ 0 ]; + v1 = (float) ip[ 1 ]; + v2 = (float) ip[ 2 ]; + v3 = (float) ip[ 3 ]; + + do + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op[ 2 ] = v2; + op[ 3 ] = v3; + op += 4; + + } while( --repl != 0 ); + } + + while( cc != 0 ) + { + op[ 0 ] = (float) ip[ 0 ]; + op[ 1 ] = (float) ip[ 1 ]; + op[ 2 ] = (float) ip[ 2 ]; + op[ 3 ] = (float) ip[ 3 ]; + ip += ipinc; + op += 4; + cc--; + } + + if( repr > 0 ) + { + const T* const ipe = ip - ipinc; + v0 = (float) ipe[ 0 ]; + v1 = (float) ipe[ 1 ]; + v2 = (float) ipe[ 2 ]; + v3 = (float) ipe[ 3 ]; + + do + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op[ 2 ] = v2; + op[ 3 ] = v3; + op += 4; + + } while( --repr != 0 ); + } + } + + /** + * Function pads the specified scanline buffer to the left and right by + * replicating its first and last available pixels, in preparation for + * horizontal resizing. Variants for 1-4-channel images. + * + * @param[in,out] op Scanline buffer to pad. + * @param rs Scanline resizing positions object. + * @param l Source scanline's length, in pixels. 
+ */ + + static void padScanline1h( float* op, CResizeScanline& rs, const int l ) + { + const float* ip = op + rs.padl; + + float v0 = ip[ 0 ]; + int i; + + for( i = 0; i < rs.padl; i++ ) + { + op[ i ] = v0; + } + + ip += l; + op += rs.padl + l; + + v0 = ip[ -1 ]; + + for( i = 0; i < rs.padr; i++ ) + { + op[ i ] = v0; + } + } + + static void padScanline2h( float* op, CResizeScanline& rs, const int l ) + { + const float* ip = op + rs.padl * 2; + + float v0 = ip[ 0 ]; + float v1 = ip[ 1 ]; + int i; + + for( i = 0; i < rs.padl; i++ ) + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op += 2; + } + + const int lc = l * 2; + ip += lc; + op += lc; + + v0 = ip[ -2 ]; + v1 = ip[ -1 ]; + + for( i = 0; i < rs.padr; i++ ) + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op += 2; + } + } + + static void padScanline3h( float* op, CResizeScanline& rs, const int l ) + { + const float* ip = op + rs.padl * 3; + + float v0 = ip[ 0 ]; + float v1 = ip[ 1 ]; + float v2 = ip[ 2 ]; + int i; + + for( i = 0; i < rs.padl; i++ ) + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op[ 2 ] = v2; + op += 3; + } + + const int lc = l * 3; + ip += lc; + op += lc; + + v0 = ip[ -3 ]; + v1 = ip[ -2 ]; + v2 = ip[ -1 ]; + + for( i = 0; i < rs.padr; i++ ) + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op[ 2 ] = v2; + op += 3; + } + } + + static void padScanline4h( float* op, CResizeScanline& rs, const int l ) + { + const float* ip = op + rs.padl * 4; + + float v0 = ip[ 0 ]; + float v1 = ip[ 1 ]; + float v2 = ip[ 2 ]; + float v3 = ip[ 3 ]; + int i; + + for( i = 0; i < rs.padl; i++ ) + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op[ 2 ] = v2; + op[ 3 ] = v3; + op += 4; + } + + const int lc = l * 4; + ip += lc; + op += lc; + + v0 = ip[ -4 ]; + v1 = ip[ -3 ]; + v2 = ip[ -2 ]; + v3 = ip[ -1 ]; + + for( i = 0; i < rs.padr; i++ ) + { + op[ 0 ] = v0; + op[ 1 ] = v1; + op[ 2 ] = v2; + op[ 3 ] = v3; + op += 4; + } + } + + /** + * Function rounds a value and applies clamping. + * + * @param v Value to round and clamp. + * @param Clamp High clamp level; low level is 0. + */ + + static inline int roundclamp( const float v, const int Clamp ) + { + if( v < 0.5f ) + { + return( 0 ); + } + + const int vr = (int) ( v + 0.5f ); + + return( vr > Clamp ? Clamp : vr ); + } + + /** + * Function performs final output of the resized scanline pixels to the + * destination image buffer. + * + * @param[in] ip Input resized scanline. + * @param[out] op Output image buffer. + * @param l Output scanline's size (not pixel count). + * @param Clamp Clamp high level, used if `IsOutFloat` is `false`. + * @param IsOutFloat `true` if floating-point output, and no clamping is + * necessary. + * @param OutMul Output multiplier, for value range conversion. + * @tparam T Output buffer's element type. 
+ */ + + template< typename T > + static void copyToOutput( const float* ip, T* op, int l, const int Clamp, + const bool IsOutFloat, const float OutMul ) + { + const bool IsUnityMul = ( OutMul == 1.0f ); + + if( IsOutFloat ) + { + if( IsUnityMul ) + { + if( sizeof( op[ 0 ]) == sizeof( ip[ 0 ])) + { + memcpy( op, ip, l * sizeof( op[ 0 ])); + return; + } + else + { + int l4 = l >> 2; + l &= 3; + + while( l4 != 0 ) + { + op[ 0 ] = (T) ip[ 0 ]; + op[ 1 ] = (T) ip[ 1 ]; + op[ 2 ] = (T) ip[ 2 ]; + op[ 3 ] = (T) ip[ 3 ]; + ip += 4; + op += 4; + l4--; + } + + while( l != 0 ) + { + *op = (T) *ip; + ip++; + op++; + l--; + } + + return; + } + } + + int l4 = l >> 2; + l &= 3; + bool DoScalar = true; + + if( sizeof( op[ 0 ]) == sizeof( ip[ 0 ])) + { + #if defined( LANCIR_SSE2 ) + + DoScalar = false; + const __m128 om = _mm_set1_ps( OutMul ); + + while( l4 != 0 ) + { + _mm_storeu_ps( (float*) op, + _mm_mul_ps( _mm_load_ps( ip ), om )); + + ip += 4; + op += 4; + l4--; + } + + #elif defined( LANCIR_NEON ) + + DoScalar = false; + const float32x4_t om = vdupq_n_f32( OutMul ); + + while( l4 != 0 ) + { + vst1q_f32( (float*) op, + vmulq_f32( vld1q_f32( ip ), om )); + + ip += 4; + op += 4; + l4--; + } + + #endif // defined( LANCIR_NEON ) + } + + if( DoScalar ) + { + while( l4 != 0 ) + { + op[ 0 ] = (T) ( ip[ 0 ] * OutMul ); + op[ 1 ] = (T) ( ip[ 1 ] * OutMul ); + op[ 2 ] = (T) ( ip[ 2 ] * OutMul ); + op[ 3 ] = (T) ( ip[ 3 ] * OutMul ); + ip += 4; + op += 4; + l4--; + } + } + + while( l != 0 ) + { + *op = (T) ( *ip * OutMul ); + ip++; + op++; + l--; + } + + return; + } + + int l4 = l >> 2; + l &= 3; + + #if defined( LANCIR_SSE2 ) + + const __m128 minv = _mm_setzero_ps(); + const __m128 maxv = _mm_set1_ps( (float) Clamp ); + const __m128 om = _mm_set1_ps( OutMul ); + + unsigned int prevrm = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE( _MM_ROUND_NEAREST ); + + if( sizeof( op[ 0 ]) == 4 ) + { + while( l4 != 0 ) + { + const __m128 cv = _mm_max_ps( _mm_min_ps( + _mm_mul_ps( _mm_load_ps( ip ), om ), maxv ), minv ); + + _mm_storeu_si128( (__m128i*) op, _mm_cvtps_epi32( cv )); + + ip += 4; + op += 4; + l4--; + } + } + else + if( sizeof( op[ 0 ]) == 2 ) + { + while( l4 != 0 ) + { + const __m128 cv = _mm_max_ps( _mm_min_ps( + _mm_mul_ps( _mm_load_ps( ip ), om ), maxv ), minv ); + + const __m128i v32 = _mm_cvtps_epi32( cv ); + const __m128i v16s = _mm_shufflehi_epi16( + _mm_shufflelo_epi16( v32, 0 | 2 << 2 ), 0 | 2 << 2 ); + + const __m128i v16 = _mm_shuffle_epi32( v16s, 0 | 2 << 2 ); + + uint64_t tmp[ 2 ]; + _mm_storeu_si128( (__m128i*) tmp, v16 ); + *(uint64_t*) op = tmp[ 0 ]; + + ip += 4; + op += 4; + l4--; + } + } + else + { + while( l4 != 0 ) + { + const __m128 cv = _mm_max_ps( _mm_min_ps( + _mm_mul_ps( _mm_load_ps( ip ), om ), maxv ), minv ); + + const __m128i v32 = _mm_cvtps_epi32( cv ); + const __m128i v16s = _mm_shufflehi_epi16( + _mm_shufflelo_epi16( v32, 0 | 2 << 2 ), 0 | 2 << 2 ); + + const __m128i v16 = _mm_shuffle_epi32( v16s, 0 | 2 << 2 ); + const __m128i v8 = _mm_packus_epi16( v16, v16 ); + + *(uint32_t*) op = (uint32_t) _mm_cvtsi128_si32( v8 ); + + ip += 4; + op += 4; + l4--; + } + } + + _MM_SET_ROUNDING_MODE( prevrm ); + + #elif defined( LANCIR_NEON ) + + const float32x4_t minv = vdupq_n_f32( 0.0f ); + const float32x4_t maxv = vdupq_n_f32( (float) Clamp ); + const float32x4_t om = vdupq_n_f32( OutMul ); + const float32x4_t v05 = vdupq_n_f32( 0.5f ); + + if( sizeof( op[ 0 ]) == 4 ) + { + while( l4 != 0 ) + { + const float32x4_t cv = vmaxq_f32( vminq_f32( + vmulq_f32( vld1q_f32( ip ), om ), maxv ), 
minv ); + + vst1q_u32( (uint32_t*) op, vcvtq_u32_f32( vaddq_f32( + cv, v05 ))); + + ip += 4; + op += 4; + l4--; + } + } + else + if( sizeof( op[ 0 ]) == 2 ) + { + while( l4 != 0 ) + { + const float32x4_t cv = vmaxq_f32( vminq_f32( + vmulq_f32( vld1q_f32( ip ), om ), maxv ), minv ); + + const uint32x4_t v32 = vcvtq_u32_f32( vaddq_f32( cv, v05 )); + const uint16x4_t v16 = vmovn_u32( v32 ); + + vst1_u16( (uint16_t*) op, v16 ); + + ip += 4; + op += 4; + l4--; + } + } + else + { + while( l4 != 0 ) + { + const float32x4_t cv = vmaxq_f32( vminq_f32( + vmulq_f32( vld1q_f32( ip ), om ), maxv ), minv ); + + const uint32x4_t v32 = vcvtq_u32_f32( vaddq_f32( cv, v05 )); + const uint16x4_t v16 = vmovn_u32( v32 ); + const uint8x8_t v8 = vmovn_u16( vcombine_u16( v16, v16 )); + + *(uint32_t*) op = vget_lane_u32( (uint32x2_t) v8, 0 ); + + ip += 4; + op += 4; + l4--; + } + } + + #else // defined( LANCIR_NEON ) + + if( IsUnityMul ) + { + while( l4 != 0 ) + { + op[ 0 ] = (T) roundclamp( ip[ 0 ], Clamp ); + op[ 1 ] = (T) roundclamp( ip[ 1 ], Clamp ); + op[ 2 ] = (T) roundclamp( ip[ 2 ], Clamp ); + op[ 3 ] = (T) roundclamp( ip[ 3 ], Clamp ); + ip += 4; + op += 4; + l4--; + } + } + else + { + while( l4 != 0 ) + { + op[ 0 ] = (T) roundclamp( ip[ 0 ] * OutMul, Clamp ); + op[ 1 ] = (T) roundclamp( ip[ 1 ] * OutMul, Clamp ); + op[ 2 ] = (T) roundclamp( ip[ 2 ] * OutMul, Clamp ); + op[ 3 ] = (T) roundclamp( ip[ 3 ] * OutMul, Clamp ); + ip += 4; + op += 4; + l4--; + } + } + + #endif // defined( LANCIR_NEON ) + + if( IsUnityMul ) + { + while( l != 0 ) + { + *op = (T) roundclamp( *ip, Clamp ); + ip++; + op++; + l--; + } + } + else + { + while( l != 0 ) + { + *op = (T) roundclamp( *ip * OutMul, Clamp ); + ip++; + op++; + l--; + } + } + } + + #define LANCIR_LF_PRE \ + const CResizePos* const rpe = rp + DstLen; \ + while( rp != rpe ) \ + { \ + const float* flt = rp -> flt; \ + const float* ip; \ + if( UseSP ) \ + { \ + ip = (float*) ( (intptr_t) sp + rp -> spo ); \ + } \ + else \ + { \ + ip = (float*) rp -> spo; \ + } + + #define LANCIR_LF_POST \ + op += opinc; \ + rp++; \ + } + + /** + * Function performs scanline resizing. Variants for 1-4-channel images. + * + * @param[in] sp Source scanline buffer. + * @param[out] op Destination buffer. + * @param opinc `op` increment. + * @param rp Source scanline offsets and resizing filters. + * @param kl Filter kernel's length, in taps (always an even value). + * @param DstLen Destination length, in pixels. + * @tparam UseSP `true` if `sp` pointer should be added to `spo`. 
+ */ + + template< bool UseSP > + static void resize1( const float* const sp, float* op, const size_t opinc, + const CResizePos* rp, const int kl, const int DstLen ) + { + const int ci = kl >> 2; + + if(( kl & 3 ) == 0 ) + { + LANCIR_LF_PRE + + int c = ci; + + #if defined( LANCIR_SSE2 ) + + __m128 sum = _mm_mul_ps( _mm_load_ps( flt ), _mm_loadu_ps( ip )); + + while( --c != 0 ) + { + flt += 4; + ip += 4; + sum = _mm_add_ps( sum, _mm_mul_ps( _mm_load_ps( flt ), + _mm_loadu_ps( ip ))); + } + + sum = _mm_add_ps( sum, _mm_movehl_ps( sum, sum )); + + _mm_store_ss( op, _mm_add_ss( sum, + _mm_shuffle_ps( sum, sum, 1 ))); + + #elif defined( LANCIR_NEON ) + + float32x4_t sum = vmulq_f32( vld1q_f32( flt ), vld1q_f32( ip )); + + while( --c != 0 ) + { + flt += 4; + ip += 4; + sum = vmlaq_f32( sum, vld1q_f32( flt ), vld1q_f32( ip )); + } + + op[ 0 ] = vaddvq_f32( sum ); + + #else // defined( LANCIR_NEON ) + + float sum0 = flt[ 0 ] * ip[ 0 ]; + float sum1 = flt[ 1 ] * ip[ 1 ]; + float sum2 = flt[ 2 ] * ip[ 2 ]; + float sum3 = flt[ 3 ] * ip[ 3 ]; + + while( --c != 0 ) + { + flt += 4; + ip += 4; + sum0 += flt[ 0 ] * ip[ 0 ]; + sum1 += flt[ 1 ] * ip[ 1 ]; + sum2 += flt[ 2 ] * ip[ 2 ]; + sum3 += flt[ 3 ] * ip[ 3 ]; + } + + op[ 0 ] = ( sum0 + sum1 ) + ( sum2 + sum3 ); + + #endif // defined( LANCIR_NEON ) + + LANCIR_LF_POST + } + else + { + LANCIR_LF_PRE + + int c = ci; + + #if defined( LANCIR_SSE2 ) + + __m128 sum = _mm_mul_ps( _mm_load_ps( flt ), _mm_loadu_ps( ip )); + + while( --c != 0 ) + { + flt += 4; + ip += 4; + sum = _mm_add_ps( sum, _mm_mul_ps( _mm_load_ps( flt ), + _mm_loadu_ps( ip ))); + } + + sum = _mm_add_ps( sum, _mm_movehl_ps( sum, sum )); + + const __m128 sum2 = _mm_mul_ps( _mm_loadu_ps( flt + 2 ), + _mm_loadu_ps( ip + 2 )); + + sum = _mm_add_ps( sum, _mm_movehl_ps( sum2, sum2 )); + + _mm_store_ss( op, _mm_add_ss( sum, + _mm_shuffle_ps( sum, sum, 1 ))); + + #elif defined( LANCIR_NEON ) + + float32x4_t sum = vmulq_f32( vld1q_f32( flt ), vld1q_f32( ip )); + + while( --c != 0 ) + { + flt += 4; + ip += 4; + sum = vmlaq_f32( sum, vld1q_f32( flt ), vld1q_f32( ip )); + } + + const float32x2_t sum2 = vadd_f32( vget_high_f32( sum ), + vget_low_f32( sum )); + + op[ 0 ] = vaddv_f32( vmla_f32( sum2, vld1_f32( flt + 4 ), + vld1_f32( ip + 4 ))); + + #else // defined( LANCIR_NEON ) + + float sum0 = flt[ 0 ] * ip[ 0 ]; + float sum1 = flt[ 1 ] * ip[ 1 ]; + float sum2 = flt[ 2 ] * ip[ 2 ]; + float sum3 = flt[ 3 ] * ip[ 3 ]; + + while( --c != 0 ) + { + flt += 4; + ip += 4; + sum0 += flt[ 0 ] * ip[ 0 ]; + sum1 += flt[ 1 ] * ip[ 1 ]; + sum2 += flt[ 2 ] * ip[ 2 ]; + sum3 += flt[ 3 ] * ip[ 3 ]; + } + + op[ 0 ] = ( sum0 + sum1 ) + ( sum2 + sum3 ) + + flt[ 4 ] * ip[ 4 ] + flt[ 5 ] * ip[ 5 ]; + + #endif // defined( LANCIR_NEON ) + + LANCIR_LF_POST + } + } + + template< bool UseSP > + static void resize2( const float* const sp, float* op, const size_t opinc, + const CResizePos* rp, const int kl, const int DstLen ) + { + #if LANCIR_ALIGN > 4 + const int ci = kl >> 2; + const int cir = kl & 3; + #else // LANCIR_ALIGN > 4 + const int ci = kl >> 1; + #endif // LANCIR_ALIGN > 4 + + LANCIR_LF_PRE + + int c = ci; + + #if defined( LANCIR_AVX ) + + __m256 sum = _mm256_mul_ps( _mm256_load_ps( flt ), + _mm256_loadu_ps( ip )); + + while( --c != 0 ) + { + flt += 8; + ip += 8; + sum = _mm256_add_ps( sum, _mm256_mul_ps( _mm256_load_ps( flt ), + _mm256_loadu_ps( ip ))); + } + + __m128 res = _mm_add_ps( _mm256_extractf128_ps( sum, 0 ), + _mm256_extractf128_ps( sum, 1 )); + + if( cir == 2 ) + { + res = _mm_add_ps( res, _mm_mul_ps( 
_mm_load_ps( flt + 8 ), + _mm_loadu_ps( ip + 8 ))); + } + + res = _mm_add_ps( res, _mm_movehl_ps( res, res )); + + _mm_store_ss( op, res ); + _mm_store_ss( op + 1, _mm_shuffle_ps( res, res, 1 )); + + #elif defined( LANCIR_SSE2 ) + + __m128 sumA = _mm_mul_ps( _mm_load_ps( flt ), _mm_loadu_ps( ip )); + __m128 sumB = _mm_mul_ps( _mm_load_ps( flt + 4 ), + _mm_loadu_ps( ip + 4 )); + + while( --c != 0 ) + { + flt += 8; + ip += 8; + sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt ), + _mm_loadu_ps( ip ))); + + sumB = _mm_add_ps( sumB, _mm_mul_ps( _mm_load_ps( flt + 4 ), + _mm_loadu_ps( ip + 4 ))); + } + + sumA = _mm_add_ps( sumA, sumB ); + + if( cir == 2 ) + { + sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt + 8 ), + _mm_loadu_ps( ip + 8 ))); + } + + sumA = _mm_add_ps( sumA, _mm_movehl_ps( sumA, sumA )); + + _mm_store_ss( op, sumA ); + _mm_store_ss( op + 1, _mm_shuffle_ps( sumA, sumA, 1 )); + + #elif defined( LANCIR_NEON ) + + float32x4_t sumA = vmulq_f32( vld1q_f32( flt ), vld1q_f32( ip )); + float32x4_t sumB = vmulq_f32( vld1q_f32( flt + 4 ), + vld1q_f32( ip + 4 )); + + while( --c != 0 ) + { + flt += 8; + ip += 8; + sumA = vmlaq_f32( sumA, vld1q_f32( flt ), vld1q_f32( ip )); + sumB = vmlaq_f32( sumB, vld1q_f32( flt + 4 ), + vld1q_f32( ip + 4 )); + } + + sumA = vaddq_f32( sumA, sumB ); + + if( cir == 2 ) + { + sumA = vmlaq_f32( sumA, vld1q_f32( flt + 8 ), + vld1q_f32( ip + 8 )); + } + + vst1_f32( op, vadd_f32( vget_high_f32( sumA ), vget_low_f32( sumA ))); + + #else // defined( LANCIR_NEON ) + + const float xx = flt[ 0 ]; + const float xx2 = flt[ 1 ]; + float sum0 = xx * ip[ 0 ]; + float sum1 = xx * ip[ 1 ]; + float sum2 = xx2 * ip[ 2 ]; + float sum3 = xx2 * ip[ 3 ]; + + while( --c != 0 ) + { + flt += 2; + ip += 4; + const float xx = flt[ 0 ]; + const float xx2 = flt[ 1 ]; + sum0 += xx * ip[ 0 ]; + sum1 += xx * ip[ 1 ]; + sum2 += xx2 * ip[ 2 ]; + sum3 += xx2 * ip[ 3 ]; + } + + op[ 0 ] = sum0 + sum2; + op[ 1 ] = sum1 + sum3; + + #endif // defined( LANCIR_NEON ) + + LANCIR_LF_POST + } + + template< bool UseSP > + static void resize3( const float* const sp, float* op, const size_t opinc, + const CResizePos* rp, const int kl, const int DstLen ) + { + #if LANCIR_ALIGN > 4 + + const int ci = kl >> 2; + const int cir = kl & 3; + + LANCIR_LF_PRE + + float res[ 12 ]; + int c = ci; + + #if defined( LANCIR_AVX ) + + __m128 sumA = _mm_mul_ps( _mm_load_ps( flt ), _mm_loadu_ps( ip )); + __m256 sumB = _mm256_mul_ps( _mm256_loadu_ps( flt + 4 ), + _mm256_loadu_ps( ip + 4 )); + + while( --c != 0 ) + { + flt += 12; + ip += 12; + sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt ), + _mm_loadu_ps( ip ))); + + sumB = _mm256_add_ps( sumB, _mm256_mul_ps( + _mm256_loadu_ps( flt + 4 ), _mm256_loadu_ps( ip + 4 ))); + } + + if( cir == 2 ) + { + sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt + 12 ), + _mm_loadu_ps( ip + 12 ))); + } + + _mm_storeu_ps( res, sumA ); + + float o0 = res[ 0 ] + res[ 3 ]; + float o1 = res[ 1 ]; + float o2 = res[ 2 ]; + + _mm256_storeu_ps( res + 4, sumB ); + + o1 += res[ 4 ]; + o2 += res[ 5 ]; + + #elif defined( LANCIR_SSE2 ) + + __m128 sumA = _mm_mul_ps( _mm_load_ps( flt ), _mm_loadu_ps( ip )); + __m128 sumB = _mm_mul_ps( _mm_load_ps( flt + 4 ), + _mm_loadu_ps( ip + 4 )); + + __m128 sumC = _mm_mul_ps( _mm_load_ps( flt + 8 ), + _mm_loadu_ps( ip + 8 )); + + while( --c != 0 ) + { + flt += 12; + ip += 12; + sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt ), + _mm_loadu_ps( ip ))); + + sumB = _mm_add_ps( sumB, _mm_mul_ps( _mm_load_ps( flt + 4 ), + _mm_loadu_ps( ip + 4 
))); + + sumC = _mm_add_ps( sumC, _mm_mul_ps( _mm_load_ps( flt + 8 ), + _mm_loadu_ps( ip + 8 ))); + } + + if( cir == 2 ) + { + sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt + 12 ), + _mm_loadu_ps( ip + 12 ))); + } + + _mm_storeu_ps( res, sumA ); + _mm_storeu_ps( res + 4, sumB ); + + float o0 = res[ 0 ] + res[ 3 ]; + float o1 = res[ 1 ] + res[ 4 ]; + float o2 = res[ 2 ] + res[ 5 ]; + + _mm_storeu_ps( res + 8, sumC ); + + #elif defined( LANCIR_NEON ) + + float32x4_t sumA = vmulq_f32( vld1q_f32( flt ), vld1q_f32( ip )); + float32x4_t sumB = vmulq_f32( vld1q_f32( flt + 4 ), + vld1q_f32( ip + 4 )); + + float32x4_t sumC = vmulq_f32( vld1q_f32( flt + 8 ), + vld1q_f32( ip + 8 )); + + while( --c != 0 ) + { + flt += 12; + ip += 12; + sumA = vmlaq_f32( sumA, vld1q_f32( flt ), vld1q_f32( ip )); + sumB = vmlaq_f32( sumB, vld1q_f32( flt + 4 ), + vld1q_f32( ip + 4 )); + + sumC = vmlaq_f32( sumC, vld1q_f32( flt + 8 ), + vld1q_f32( ip + 8 )); + } + + if( cir == 2 ) + { + sumA = vmlaq_f32( sumA, vld1q_f32( flt + 12 ), + vld1q_f32( ip + 12 )); + } + + vst1q_f32( res, sumA ); + vst1q_f32( res + 4, sumB ); + + float o0 = res[ 0 ] + res[ 3 ]; + float o1 = res[ 1 ] + res[ 4 ]; + float o2 = res[ 2 ] + res[ 5 ]; + + vst1q_f32( res + 8, sumC ); + + #endif // defined( LANCIR_NEON ) + + o0 += res[ 6 ] + res[ 9 ]; + o1 += res[ 7 ] + res[ 10 ]; + o2 += res[ 8 ] + res[ 11 ]; + + if( cir == 2 ) + { + o1 += flt[ 16 ] * ip[ 16 ]; + o2 += flt[ 17 ] * ip[ 17 ]; + } + + op[ 0 ] = o0; + op[ 1 ] = o1; + op[ 2 ] = o2; + + #else // LANCIR_ALIGN > 4 + + const int ci = kl >> 1; + + LANCIR_LF_PRE + + int c = ci; + + const float xx = flt[ 0 ]; + float sum0 = xx * ip[ 0 ]; + float sum1 = xx * ip[ 1 ]; + float sum2 = xx * ip[ 2 ]; + const float xx2 = flt[ 1 ]; + float sum3 = xx2 * ip[ 3 ]; + float sum4 = xx2 * ip[ 4 ]; + float sum5 = xx2 * ip[ 5 ]; + + while( --c != 0 ) + { + flt += 2; + ip += 6; + const float xx = flt[ 0 ]; + sum0 += xx * ip[ 0 ]; + sum1 += xx * ip[ 1 ]; + sum2 += xx * ip[ 2 ]; + const float xx2 = flt[ 1 ]; + sum3 += xx2 * ip[ 3 ]; + sum4 += xx2 * ip[ 4 ]; + sum5 += xx2 * ip[ 5 ]; + } + + op[ 0 ] = sum0 + sum3; + op[ 1 ] = sum1 + sum4; + op[ 2 ] = sum2 + sum5; + + #endif // LANCIR_ALIGN > 4 + + LANCIR_LF_POST + } + + template< bool UseSP > + static void resize4( const float* const sp, float* op, const size_t opinc, + const CResizePos* rp, const int kl, const int DstLen ) + { + #if LANCIR_ALIGN > 4 + const int ci = kl >> 1; + #else // LANCIR_ALIGN > 4 + const int ci = kl; + #endif // LANCIR_ALIGN > 4 + + LANCIR_LF_PRE + + int c = ci; + + #if defined( LANCIR_AVX ) + + __m256 sum = _mm256_mul_ps( _mm256_load_ps( flt ), + _mm256_loadu_ps( ip )); + + while( --c != 0 ) + { + flt += 8; + ip += 8; + sum = _mm256_add_ps( sum, _mm256_mul_ps( _mm256_load_ps( flt ), + _mm256_loadu_ps( ip ))); + } + + _mm_store_ps( op, _mm_add_ps( _mm256_extractf128_ps( sum, 0 ), + _mm256_extractf128_ps( sum, 1 ))); + + #elif defined( LANCIR_SSE2 ) + + __m128 sumA = _mm_mul_ps( _mm_load_ps( flt ), _mm_load_ps( ip )); + __m128 sumB = _mm_mul_ps( _mm_load_ps( flt + 4 ), + _mm_load_ps( ip + 4 )); + + while( --c != 0 ) + { + flt += 8; + ip += 8; + sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt ), + _mm_load_ps( ip ))); + + sumB = _mm_add_ps( sumB, _mm_mul_ps( _mm_load_ps( flt + 4 ), + _mm_load_ps( ip + 4 ))); + } + + _mm_store_ps( op, _mm_add_ps( sumA, sumB )); + + #elif defined( LANCIR_NEON ) + + float32x4_t sumA = vmulq_f32( vld1q_f32( flt ), vld1q_f32( ip )); + float32x4_t sumB = vmulq_f32( vld1q_f32( flt + 4 ), + vld1q_f32( ip + 4 
)); + + while( --c != 0 ) + { + flt += 8; + ip += 8; + sumA = vmlaq_f32( sumA, vld1q_f32( flt ), vld1q_f32( ip )); + sumB = vmlaq_f32( sumB, vld1q_f32( flt + 4 ), + vld1q_f32( ip + 4 )); + } + + vst1q_f32( op, vaddq_f32( sumA, sumB )); + + #else // defined( LANCIR_NEON ) + + const float xx = flt[ 0 ]; + float sum0 = xx * ip[ 0 ]; + float sum1 = xx * ip[ 1 ]; + float sum2 = xx * ip[ 2 ]; + float sum3 = xx * ip[ 3 ]; + + while( --c != 0 ) + { + flt++; + ip += 4; + const float xx = flt[ 0 ]; + sum0 += xx * ip[ 0 ]; + sum1 += xx * ip[ 1 ]; + sum2 += xx * ip[ 2 ]; + sum3 += xx * ip[ 3 ]; + } + + op[ 0 ] = sum0; + op[ 1 ] = sum1; + op[ 2 ] = sum2; + op[ 3 ] = sum3; + + #endif // defined( LANCIR_NEON ) + + LANCIR_LF_POST + } + + #undef LANCIR_LF_PRE + #undef LANCIR_LF_POST +}; + +#undef LANCIR_ALIGN + +} // namespace avir + +#endif // AVIR_CLANCIR_INCLUDED
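
The new `_loadHiresTextureMipMapAccurate` path packs all mip tiles of a hires texture into a single atlas texture. The first 16 words of the atlas (`texDataOffset = 16`: 8 `gDP.tiles` entries times 2 words) form a per-level directory: word `2 * level` holds the texel offset of that level's tile, and word `2 * level + 1` packs the tile width (bits 0-15) together with the T and S mip ratios (bits 16-23 and 24-31). A minimal sketch of that encoding, with hypothetical names not taken from the patch:

```cpp
#include <cassert>
#include <cstdint>

// Illustrative encode/decode of one directory entry written by
// _loadHiresTextureMipMapAccurate:
//   width | (mipRatioT << 16) | (mipRatioS << 24)
struct MipTileInfo
{
	uint32_t width;     // tile width, in texels (bits 0-15)
	uint32_t mipRatioT; // log2 size ratio against level 0, T axis (bits 16-23)
	uint32_t mipRatioS; // log2 size ratio against level 0, S axis (bits 24-31)
};

static uint32_t packTileInfo(const MipTileInfo& t)
{
	return t.width | (t.mipRatioT << 16) | (t.mipRatioS << 24);
}

static MipTileInfo unpackTileInfo(uint32_t packed)
{
	return { packed & 0xFFFFu, (packed >> 16) & 0xFFu, packed >> 24 };
}

int main()
{
	const MipTileInfo t{ 256u, 1u, 1u };
	const MipTileInfo u = unpackTileInfo(packTileInfo(t));
	assert(u.width == t.width && u.mipRatioT == t.mipRatioT &&
		u.mipRatioS == t.mipRatioS);
	return 0;
}
```

The bump of `MIPMAP_TILE_WIDTH` from 256 to 512 fits this scheme: the atlas is uploaded as a 2D texture of width `min(texDataOffset, MIPMAP_TILE_WIDTH)` and height `ceil(texDataOffset / MIPMAP_TILE_WIDTH)`, so wider rows keep the row count manageable for the larger HD tiles.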
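`lancir.h` is consumed through its primary `resizeImage()` overload, exactly as the per-level downscale does with `imageResizer.resizeImage((u8*)pTileData, texWidth, texHeight, (u8*)tileData.data(), texWidth >> shifts, texHeight >> shiftt, 4)`. A self-contained usage sketch for 4-channel 8-bit data; the buffer contents and sizes are illustrative:

```cpp
#include <cstdint>
#include <vector>

#include "inc/lancir.h"

int main()
{
	const int srcW = 64, srcH = 64;
	const int dstW = srcW >> 1, dstH = srcH >> 1; // one mip step down

	std::vector<uint8_t> src(srcW * srcH * 4, 128); // flat grey RGBA
	std::vector<uint8_t> dst(dstW * dstH * 4);

	// CLancIR keeps its temporary buffers between calls, which makes batch
	// resizing faster but also makes the object non-thread-safe: use one
	// instance per thread.
	avir::CLancIR resizer;
	const int lines = resizer.resizeImage(src.data(), srcW, srcH,
		dst.data(), dstW, dstH, 4);

	return lines == dstH ? 0 : 1; // returns NewHeight, or 0 on bad arguments
}
```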
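Inside the resizer, `CSineGen` avoids a `sin()` call per filter tap by iterating the two-term recurrence `s1' = 2 * cos(d) * s1 - s2`, seeded with `sin(ph)` and `sin(ph - d)`; this follows from the angle-addition identity `sin(a + d) + sin(a - d) = 2 * sin(a) * cos(d)`. A short standalone check that the oscillator tracks the library sine:

```cpp
#include <cassert>
#include <cmath>

int main()
{
	const double d = 0.37, ph = 1.1; // arbitrary increment and phase

	double s1 = std::sin(ph);     // current value, sin(ph + 0 * d)
	double s2 = std::sin(ph - d); // previous value
	const double incr = 2.0 * std::cos(d);

	for (int n = 0; n < 1000; ++n)
	{
		assert(std::fabs(s1 - std::sin(ph + n * d)) < 1e-9);

		const double next = incr * s1 - s2; // one oscillator step
		s2 = s1;
		s1 = next;
	}

	return 0;
}
```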
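`makeFilterNorm()` uses two such oscillators to evaluate the Lanczos kernel at taps `x = t + FracDelay`: each tap is `sin(Freq * x) * sin(FreqA * x) / (x * x)`, with the limit `Freq * FreqA` at `x == 0`, and the taps are then rescaled to sum to one (DC gain of 1). A plain `std::sin` reference of the same construction; this sketch omits the zeroing of taps that fall outside `+/- Len2`, which the header handles explicitly:

```cpp
#include <cmath>
#include <vector>

std::vector<double> lanczosKernel(const double a, const double k,
	const double fracDelay)
{
	const double pi = 3.14159265358979324;
	const double normFreq = (k <= 1.0 ? 1.0 : 1.0 / k); // band limit
	const double freq = pi * normFreq;  // filter circular frequency
	const double freqA = freq / a;      // window circular frequency
	const int fl2 = (int) std::ceil(a / normFreq); // half kernel length

	std::vector<double> taps;
	double sum = 0.0;

	for (int t = -fl2; t < fl2; ++t) // KernelLen == fl2 + fl2 taps
	{
		const double x = t + fracDelay;
		const double v = (std::fabs(x) < 2.3e-13)
			? freq * freqA // limit of the expression at x == 0
			: std::sin(freq * x) * std::sin(freqA * x) / (x * x);

		taps.push_back(v);
		sum += v;
	}

	for (double& v : taps)
		v /= sum; // normalize to unit DC gain

	return taps;
}
```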
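Finally, `reallocBuf()` produces the aligned pointers that the SIMD loads rely on by over-allocating by `LANCIR_ALIGN` elements and rounding the raw address up with `(p + align - 1) & ~(align - 1)`, which is valid for any power-of-two alignment. The same trick in isolation:

```cpp
#include <cassert>
#include <cstdint>

// Round a pointer up to the next multiple of a power-of-two alignment,
// as lancir.h does for FltBuf, spv, and the filter buffers.
static float* alignUp(float* p, const uintptr_t align)
{
	return (float*) (((uintptr_t) p + align - 1) & ~(align - 1));
}

int main()
{
	alignas(64) float storage[64];

	float* unaligned = storage + 1;          // deliberately misaligned
	float* aligned = alignUp(unaligned, 32); // 32 bytes, as with LANCIR_AVX

	assert(((uintptr_t) aligned & 31) == 0);
	assert(aligned >= unaligned && aligned < storage + 64);
	return 0;
}
```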