Revision: 2744 Author: rowledge Date: 2013-06-18 16:13:50 -0700 (Tue, 18 Jun 2013) Log Message: ----------- add fast bitblt support files
Modified Paths: -------------- trunk/platforms/Cross/vm/sqMemoryAccess.h
Added Paths: ----------- trunk/platforms/Cross/plugins/BitBltPlugin/ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArm.c trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArm.h trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmLinux.c trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmOther.c trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.h trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAlphaBlend.s trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdBitLogical.s trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdPixPaint.s trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdSourceWord.s trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.h trunk/platforms/Cross/plugins/BitBltPlugin/BitBltInternal.h
Added: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArm.c =================================================================== --- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArm.c (rev 0) +++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArm.c 2013-06-18 23:13:50 UTC (rev 2744) @@ -0,0 +1,36 @@ +/* + * Copyright © 2013 Raspberry Pi Foundation + * Copyright © 2013 RISC OS Open Ltd + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of the copyright holders not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. The copyright holders make no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + */ + +#include "BitBltArm.h" +#include "BitBltArmSimd.h" + +arm_cpu_features_t armCpuFeatures; + +void addArmFastPaths(void) +{ + armCpuFeatures = detectCpuFeatures(); + if (armCpuFeatures & ARM_V6) + addArmSimdFastPaths(); +}
Added: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArm.h =================================================================== --- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArm.h (rev 0) +++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArm.h 2013-06-18 23:13:50 UTC (rev 2744) @@ -0,0 +1,44 @@ +/* + * Copyright © 2013 Raspberry Pi Foundation + * Copyright © 2013 RISC OS Open Ltd + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of the copyright holders not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. The copyright holders make no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + */ + +#ifndef BITBLTARM_H_ +#define BITBLTARM_H_ + +typedef enum { + ARM_V7 = (1 << 0), + ARM_V6 = (1 << 1), + ARM_VFP = (1 << 2), + ARM_NEON = (1 << 3), + ARM_IWMMXT = (1 << 4) +} arm_cpu_features_t; + +extern arm_cpu_features_t armCpuFeatures; + +/* There's a separate implementation of this function for each OS */ +arm_cpu_features_t detectCpuFeatures(void); + +void addArmFastPaths(void); + +#endif /* BITBLTARM_H_ */
Added: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmLinux.c =================================================================== --- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmLinux.c (rev 0) +++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmLinux.c 2013-06-18 23:13:50 UTC (rev 2744) @@ -0,0 +1,78 @@ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of SuSE not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. SuSE makes no representations about the + * suitability of this software for any purpose. It is provided "as is" + * without express or implied warranty. + * + * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +* WRT the usage in the Squeak Smalltalk system - +* This file provides a function to discover the cpu features supported at runtime; we assume +* you understand that 'arm_cpu_features' is meaningful only on ARM cpu machines. 
+* An equivalent file will be required for other ARM platforms; see BitBltArmOther.c in this directory + */ + +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <fcntl.h> +#include <string.h> +#include <elf.h> + +#include "BitBltArm.h" + +arm_cpu_features_t detectCpuFeatures(void) +{ + arm_cpu_features_t features = 0; + Elf32_auxv_t aux; + int fd; + + fd = open ("/proc/self/auxv", O_RDONLY); + if (fd >= 0) + { + while (read (fd, &aux, sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t)) + { + if (aux.a_type == AT_HWCAP) + { + uint32_t hwcap = aux.a_un.a_val; + + /* hardcode these values to avoid depending on specific + * versions of the hwcap header, e.g. HWCAP_NEON + */ + if ((hwcap & 64) != 0) + features |= ARM_VFP; + if ((hwcap & 512) != 0) + features |= ARM_IWMMXT; + /* this flag is only present on kernel 2.6.29 */ + if ((hwcap & 4096) != 0) + features |= ARM_NEON; + } + else if (aux.a_type == AT_PLATFORM) + { + const char *plat = (const char*) aux.a_un.a_val; + + if (strncmp (plat, "v7l", 3) == 0) + features |= (ARM_V7 | ARM_V6); + else if (strncmp (plat, "v6l", 3) == 0) + features |= ARM_V6; + } + } + close (fd); + } + + return features; +}
Added: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmOther.c =================================================================== --- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmOther.c (rev 0) +++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmOther.c 2013-06-18 23:13:50 UTC (rev 2744) @@ -0,0 +1,38 @@ +/* + * Copyright © 2013 Raspberry Pi Foundation + * Copyright © 2013 RISC OS Open Ltd + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of the copyright holders not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. The copyright holders make no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + * WRT the usage in the Squeak Smalltalk system - +* This file provides a function to discover the cpu features supported at runtime; we assume +* you understand that 'arm_cpu_features' is meaningful only on ARM cpu machines. 
+* Obviously, this is a null function and a suitable equivalent file will be required for actual ARM platforms; +* see BitBltArmLinux.c in this directory as an example + */ + +#include "BitBltArm.h" + +/* There is no OS-neutral way of determining which type of ARM this is */ + +arm_cpu_features_t detectCpuFeatures(void) +{ + return 0; +}
Added: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c =================================================================== --- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c (rev 0) +++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c 2013-06-18 23:13:50 UTC (rev 2744) @@ -0,0 +1,263 @@ +/* + * Copyright © 2013 Raspberry Pi Foundation + * Copyright © 2013 RISC OS Open Ltd + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of the copyright holders not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. The copyright holders make no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. 
+ * + */ + +#include <stddef.h> +#include <stdint.h> + +#include "BitBltInternal.h" + +enum { + HALFTONE_NONE, + HALFTONE_SCALAR, + HALFTONE_VECTOR +}; + +//typedef void (*armSimdAsmFn)(uint32_t width, uint32_t height, uint32_t *dst, uint32_t dstStride, uint32_t *src, uint32_t srcStride, uint32_t halftone, uint32_t halftoneInfo, uint32_t *colourMap, uint32_t bitPtrs, ...); + +#define FAST_PATH(op, src_bpp, dst_bpp, qualifier, halftone_type) \ +extern void armSimd##op##src_bpp##_##dst_bpp##qualifier##_wide (uint32_t width, uint32_t height, uint32_t *dst, uint32_t dstStride, uint32_t *src, uint32_t srcStride, uint32_t halftone, uint32_t halftoneInfo, uint32_t *colourMap, uint32_t bitPtrs, ...); \ +extern void armSimd##op##src_bpp##_##dst_bpp##qualifier##_narrow(uint32_t width, uint32_t height, uint32_t *dst, uint32_t dstStride, uint32_t *src, uint32_t srcStride, uint32_t halftone, uint32_t halftoneInfo, uint32_t *colourMap, uint32_t bitPtrs, ...); \ +extern void armSimd##op##src_bpp##_##dst_bpp##qualifier##_tiny (uint32_t width, uint32_t height, uint32_t *dst, uint32_t dstStride, uint32_t *src, uint32_t srcStride, uint32_t halftone, uint32_t halftoneInfo, uint32_t *colourMap, uint32_t bitPtrs, ...); \ +static void fastPath##op##src_bpp##_##dst_bpp##qualifier(operation_t *op, uint32_t flags) \ +{ \ + IGNORE(flags); \ + /* Copy certain parts of the operation structure to locals to help compiler */ \ + uint32_t *srcBits = op->src.bits; \ + uint32_t srcPitch = op->src.pitch / sizeof (uint32_t); \ + uint32_t srcX = op->src.x; \ + uint32_t srcY = op->src.y; \ + uint32_t *dstBits = op->dest.bits; \ + uint32_t dstPitch = op->dest.pitch / sizeof (uint32_t); \ + uint32_t dstX = op->dest.x; \ + uint32_t dstY = op->dest.y; \ + uint32_t width = op->width; \ + uint32_t height = op->height; \ + uint32_t *cmLookupTable = *op->cmLookupTable; \ + uint32_t halftoneHeight = op->halftoneHeight; \ + uint32_t *halftoneBase = (uint32_t *) *op->halftoneBase; \ + /* Get pointers to initial 
words */ \ + uint32_t *src = 0; \ + if (src_bpp > 0) \ + src = srcBits + srcPitch * srcY + srcX * src_bpp / 32; \ + uint32_t *dst = dstBits + dstPitch * dstY + dstX * dst_bpp / 32; \ + /* Get initial pixel offset within words, mangle into pitch if possible */ \ + uint32_t bitPtrs = 0; \ + uint32_t srcXpix = 0; \ + if (src_bpp > 0) { \ + srcXpix = srcX & (31 / (src_bpp == 0 ? 1 : src_bpp)); /* ?: to avoid compiler warning on GCC! */ \ + if (src_bpp < 8) \ + bitPtrs = srcXpix << 27; \ + else if (src_bpp == 8 || src_bpp == 16) \ + srcPitch |= srcXpix << 30; \ + } \ + uint32_t dstXpix = dstX & (31/dst_bpp); \ + if (dst_bpp < 8) \ + bitPtrs |= dstXpix; \ + else if (dst_bpp == 8 || dst_bpp == 16) \ + dstPitch |= dstXpix << 30; \ + /* Adjust strides to remove number of words partially or wholly read/written */ \ + if (src_bpp > 0) \ + srcPitch -= (src_bpp * (srcXpix + width) + 31) / 32; \ + dstPitch -= (dst_bpp * (dstXpix + width) + 31) / 32; \ + /* Deal with halftoning */ \ + uint32_t halftone = 0; \ + uint32_t halftoneInfo = 0; \ + if (halftone_type == HALFTONE_SCALAR) \ + halftone = halftoneBase[0]; \ + else if (halftone_type == HALFTONE_VECTOR) { \ + halftone = (uint32_t) (halftoneBase + halftoneHeight); \ + halftoneInfo = (((dstY % halftoneHeight) - halftoneHeight) << 17) | (-halftoneHeight & 0x7FFF); \ + } \ + /* Work out which width class this operation is. \ + * Rather than re-evaluate this for each line, we want one choice \ + * for the whole operation; this means we can't assume anything about \ + * alignment to sizes larger than 4 bytes, because that's the only \ + * guarantee we have about line stride. 
*/ \ + if (width > (128-32)/dst_bpp && (((dstXpix-1) ^ (dstXpix+width-(128-32)/dst_bpp)) &~ (31/dst_bpp))) \ + armSimd##op##src_bpp##_##dst_bpp##qualifier##_wide(width, height, dst, dstPitch, src, srcPitch, halftone, halftoneInfo, cmLookupTable, bitPtrs); \ + else if (dst_bpp > 8 || (((dstXpix-1) ^ (dstXpix+width)) &~ (31/dst_bpp))) \ + armSimd##op##src_bpp##_##dst_bpp##qualifier##_narrow(width, height, dst, dstPitch, src, srcPitch, halftone, halftoneInfo, cmLookupTable, bitPtrs); \ + else \ + armSimd##op##src_bpp##_##dst_bpp##qualifier##_tiny(width, height, dst, dstPitch, src, srcPitch, halftone, halftoneInfo, cmLookupTable, bitPtrs); \ +} + +FAST_PATH(SourceWord,1,32,,HALFTONE_NONE) + +FAST_PATH(SourceWord,1,16,,HALFTONE_NONE) +FAST_PATH(SourceWord,2,32,,HALFTONE_NONE) + +FAST_PATH(SourceWord,1,8,,HALFTONE_NONE) +FAST_PATH(SourceWord,2,16,,HALFTONE_NONE) +FAST_PATH(SourceWord,4,32,,HALFTONE_NONE) + +FAST_PATH(SourceWord,1,4,,HALFTONE_NONE) +FAST_PATH(SourceWord,2,8,,HALFTONE_NONE) +FAST_PATH(SourceWord,4,16,,HALFTONE_NONE) +FAST_PATH(SourceWord,8,32,,HALFTONE_NONE) + +FAST_PATH(SourceWord,1,2,,HALFTONE_NONE) +FAST_PATH(SourceWord,2,4,,HALFTONE_NONE) +FAST_PATH(SourceWord,4,8,,HALFTONE_NONE) +FAST_PATH(SourceWord,8,16,,HALFTONE_NONE) +FAST_PATH(SourceWord,16,32,,HALFTONE_NONE) + +FAST_PATH(SourceWord,1,1,,HALFTONE_NONE) +FAST_PATH(SourceWord,2,2,,HALFTONE_NONE) +FAST_PATH(SourceWord,4,4,,HALFTONE_NONE) +FAST_PATH(SourceWord,8,8,,HALFTONE_NONE) +FAST_PATH(SourceWord,16,16,,HALFTONE_NONE) +FAST_PATH(SourceWord,32,32,,HALFTONE_NONE) + +FAST_PATH(SourceWord,2,1,,HALFTONE_NONE) +FAST_PATH(SourceWord,4,2,,HALFTONE_NONE) +FAST_PATH(SourceWord,8,4,,HALFTONE_NONE) +FAST_PATH(SourceWord,16,8,,HALFTONE_NONE) +FAST_PATH(SourceWord,32,16,,HALFTONE_NONE) + +FAST_PATH(SourceWord,4,1,,HALFTONE_NONE) +FAST_PATH(SourceWord,8,2,,HALFTONE_NONE) +FAST_PATH(SourceWord,16,4,,HALFTONE_NONE) +FAST_PATH(SourceWord,32,8,,HALFTONE_NONE) + +FAST_PATH(SourceWord,8,1,,HALFTONE_NONE) 
+FAST_PATH(SourceWord,16,2,,HALFTONE_NONE) +FAST_PATH(SourceWord,32,4,,HALFTONE_NONE) + +FAST_PATH(SourceWord,16,1,,HALFTONE_NONE) +FAST_PATH(SourceWord,32,2,,HALFTONE_NONE) + +FAST_PATH(SourceWord,32,1,,HALFTONE_NONE) + +FAST_PATH(SourceWord,0,1,,HALFTONE_NONE) +FAST_PATH(SourceWord,0,1,_scalar,HALFTONE_SCALAR) +FAST_PATH(SourceWord,0,2,,HALFTONE_NONE) +FAST_PATH(SourceWord,0,2,_scalar,HALFTONE_SCALAR) +FAST_PATH(SourceWord,0,4,,HALFTONE_NONE) +FAST_PATH(SourceWord,0,4,_scalar,HALFTONE_SCALAR) +FAST_PATH(SourceWord,0,8,,HALFTONE_NONE) +FAST_PATH(SourceWord,0,8,_scalar,HALFTONE_SCALAR) +FAST_PATH(SourceWord,0,16,,HALFTONE_NONE) +FAST_PATH(SourceWord,0,16,_scalar,HALFTONE_SCALAR) +FAST_PATH(SourceWord,0,32,,HALFTONE_NONE) +FAST_PATH(SourceWord,0,32,_scalar,HALFTONE_SCALAR) + +FAST_PATH(PixPaint,1,1,,HALFTONE_NONE) +FAST_PATH(PixPaint,2,2,,HALFTONE_NONE) +FAST_PATH(PixPaint,4,4,,HALFTONE_NONE) +FAST_PATH(PixPaint,8,8,,HALFTONE_NONE) +FAST_PATH(PixPaint,16,16,,HALFTONE_NONE) +FAST_PATH(PixPaint,32,32,,HALFTONE_NONE) + +FAST_PATH(AlphaBlend,32,32,,HALFTONE_NONE) + +FAST_PATH(BitAnd,1,1,,HALFTONE_NONE) +FAST_PATH(BitAnd,2,2,,HALFTONE_NONE) +FAST_PATH(BitAnd,4,4,,HALFTONE_NONE) +FAST_PATH(BitAnd,8,8,,HALFTONE_NONE) +FAST_PATH(BitAnd,16,16,,HALFTONE_NONE) +FAST_PATH(BitAnd,32,32,,HALFTONE_NONE) + +static fast_path_t fastPaths[] = { + { fastPathSourceWord1_32, CR_sourceWord, STD_FLAGS(1,32,DIRECT,NO) }, + + { fastPathSourceWord1_16, CR_sourceWord, STD_FLAGS(1,16,DIRECT,NO) }, + { fastPathSourceWord2_32, CR_sourceWord, STD_FLAGS(2,32,DIRECT,NO) }, + + { fastPathSourceWord1_8, CR_sourceWord, STD_FLAGS(1,8,DIRECT,NO) }, + { fastPathSourceWord2_16, CR_sourceWord, STD_FLAGS(2,16,DIRECT,NO) }, + { fastPathSourceWord4_32, CR_sourceWord, STD_FLAGS(4,32,DIRECT,NO) }, + + { fastPathSourceWord1_4, CR_sourceWord, STD_FLAGS(1,4,DIRECT,NO) }, + { fastPathSourceWord2_8, CR_sourceWord, STD_FLAGS(2,8,DIRECT,NO) }, + { fastPathSourceWord4_16, CR_sourceWord, STD_FLAGS(4,16,DIRECT,NO) }, + { 
fastPathSourceWord8_32, CR_sourceWord, STD_FLAGS(8,32,DIRECT,NO) }, + + { fastPathSourceWord1_2, CR_sourceWord, STD_FLAGS(1,2,DIRECT,NO) }, + { fastPathSourceWord2_4, CR_sourceWord, STD_FLAGS(2,4,DIRECT,NO) }, + { fastPathSourceWord4_8, CR_sourceWord, STD_FLAGS(4,8,DIRECT,NO) }, + { fastPathSourceWord8_16, CR_sourceWord, STD_FLAGS(8,16,DIRECT,NO) }, + { fastPathSourceWord16_32, CR_sourceWord, STD_FLAGS(16,32,NO,NO) }, + + { fastPathSourceWord1_1, CR_sourceWord, STD_FLAGS(1,1,NO,NO) }, + { fastPathSourceWord2_2, CR_sourceWord, STD_FLAGS(2,2,NO,NO) }, + { fastPathSourceWord4_4, CR_sourceWord, STD_FLAGS(4,4,NO,NO) }, + { fastPathSourceWord8_8, CR_sourceWord, STD_FLAGS(8,8,NO,NO) }, + { fastPathSourceWord16_16, CR_sourceWord, STD_FLAGS(16,16,NO,NO) }, + { fastPathSourceWord32_32, CR_sourceWord, STD_FLAGS(32,32,NO,NO) }, + + { fastPathSourceWord2_1, CR_sourceWord, STD_FLAGS(2,1,DIRECT,NO) }, + { fastPathSourceWord4_2, CR_sourceWord, STD_FLAGS(4,2,DIRECT,NO) }, + { fastPathSourceWord8_4, CR_sourceWord, STD_FLAGS(8,4,DIRECT,NO) }, + { fastPathSourceWord16_8, CR_sourceWord, STD_FLAGS(16,8,DIRECT,NO) }, + { fastPathSourceWord32_16, CR_sourceWord, STD_FLAGS(32,16,NO,NO) }, + + { fastPathSourceWord4_1, CR_sourceWord, STD_FLAGS(4,1,DIRECT,NO) }, + { fastPathSourceWord8_2, CR_sourceWord, STD_FLAGS(8,2,DIRECT,NO) }, + { fastPathSourceWord16_4, CR_sourceWord, STD_FLAGS(16,4,DIRECT,NO) }, + { fastPathSourceWord32_8, CR_sourceWord, STD_FLAGS(32,8,15BIT,NO) }, + + { fastPathSourceWord8_1, CR_sourceWord, STD_FLAGS(8,1,DIRECT,NO) }, + { fastPathSourceWord16_2, CR_sourceWord, STD_FLAGS(16,2,DIRECT,NO) }, + { fastPathSourceWord32_4, CR_sourceWord, STD_FLAGS(32,4,15BIT,NO) }, + + { fastPathSourceWord16_1, CR_sourceWord, STD_FLAGS(16,1,DIRECT,NO) }, + { fastPathSourceWord32_2, CR_sourceWord, STD_FLAGS(32,2,15BIT,NO) }, + + { fastPathSourceWord32_1, CR_sourceWord, STD_FLAGS(32,1,15BIT,NO) }, + + { fastPathSourceWord0_1, CR_sourceWord, STD_FLAGS_NO_SOURCE(1,NO) }, + { 
fastPathSourceWord0_1_scalar, CR_sourceWord, STD_FLAGS_NO_SOURCE(1,SCALAR) }, + { fastPathSourceWord0_2, CR_sourceWord, STD_FLAGS_NO_SOURCE(2,NO) }, + { fastPathSourceWord0_2_scalar, CR_sourceWord, STD_FLAGS_NO_SOURCE(2,SCALAR) }, + { fastPathSourceWord0_4, CR_sourceWord, STD_FLAGS_NO_SOURCE(4,NO) }, + { fastPathSourceWord0_4_scalar, CR_sourceWord, STD_FLAGS_NO_SOURCE(4,SCALAR) }, + { fastPathSourceWord0_8, CR_sourceWord, STD_FLAGS_NO_SOURCE(8,NO) }, + { fastPathSourceWord0_8_scalar, CR_sourceWord, STD_FLAGS_NO_SOURCE(8,SCALAR) }, + { fastPathSourceWord0_16, CR_sourceWord, STD_FLAGS_NO_SOURCE(16,NO) }, + { fastPathSourceWord0_16_scalar, CR_sourceWord, STD_FLAGS_NO_SOURCE(16,SCALAR) }, + { fastPathSourceWord0_32, CR_sourceWord, STD_FLAGS_NO_SOURCE(32,NO) }, + { fastPathSourceWord0_32_scalar, CR_sourceWord, STD_FLAGS_NO_SOURCE(32,SCALAR) }, + + { fastPathPixPaint1_1, CR_pixPaint, STD_FLAGS(1,1,NO,NO) }, + { fastPathPixPaint2_2, CR_pixPaint, STD_FLAGS(2,2,NO,NO) }, + { fastPathPixPaint4_4, CR_pixPaint, STD_FLAGS(4,4,NO,NO) }, + { fastPathPixPaint8_8, CR_pixPaint, STD_FLAGS(8,8,NO,NO) }, + { fastPathPixPaint16_16, CR_pixPaint, STD_FLAGS(16,16,NO,NO) }, + { fastPathPixPaint32_32, CR_pixPaint, STD_FLAGS(32,32,NO,NO) }, + + { fastPathAlphaBlend32_32, CR_alphaBlend, STD_FLAGS(32,32,NO,NO) }, + + { fastPathBitAnd1_1, CR_bitAnd, STD_FLAGS(1,1,NO,NO) }, + { fastPathBitAnd2_2, CR_bitAnd, STD_FLAGS(2,2,NO,NO) }, + { fastPathBitAnd4_4, CR_bitAnd, STD_FLAGS(4,4,NO,NO) }, + { fastPathBitAnd8_8, CR_bitAnd, STD_FLAGS(8,8,NO,NO) }, + { fastPathBitAnd16_16, CR_bitAnd, STD_FLAGS(16,16,NO,NO) }, + { fastPathBitAnd32_32, CR_bitAnd, STD_FLAGS(32,32,NO,NO) }, +}; + +void addArmSimdFastPaths(void) +{ + addFastPaths(fastPaths, sizeof fastPaths / sizeof *fastPaths); +}
Added: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.h =================================================================== --- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.h (rev 0) +++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.h 2013-06-18 23:13:50 UTC (rev 2744) @@ -0,0 +1,31 @@ +/* + * Copyright © 2013 Raspberry Pi Foundation + * Copyright © 2013 RISC OS Open Ltd + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of the copyright holders not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. The copyright holders make no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. + * + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + */ + +#ifndef BITBLTARMSIMD_H_ +#define BITBLTARMSIMD_H_ + +void addArmSimdFastPaths(void); + +#endif /* BITBLTARMSIMD_H_ */
Added: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAlphaBlend.s =================================================================== --- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAlphaBlend.s (rev 0) +++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAlphaBlend.s 2013-06-18 23:13:50 UTC (rev 2744) @@ -0,0 +1,141 @@ +; +; Copyright © 2013 Raspberry Pi Foundation +; Copyright © 2013 RISC OS Open Ltd +; +; Permission to use, copy, modify, distribute, and sell this software and its +; documentation for any purpose is hereby granted without fee, provided that +; the above copyright notice appear in all copies and that both that +; copyright notice and this permission notice appear in supporting +; documentation, and that the name of the copyright holders not be used in +; advertising or publicity pertaining to distribution of the software without +; specific, written prior permission. The copyright holders make no +; representations about the suitability of this software for any purpose. It +; is provided "as is" without express or implied warranty. +; +; THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS +; SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND +; FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY +; SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN +; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING +; OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +; SOFTWARE. 
+; + +; Debug options + GBLL DebugData +;DebugData SETL {TRUE} + GBLL DebugPld +;DebugPld SETL {TRUE} + GBLL VerboseBuild +;VerboseBuild SETL {TRUE} + + GET BitBltArmSimdAsm.hdr + + AREA |BitBltArmSimdAlphaBlend$$Code|, CODE, READONLY + ARM + +; ******************************************************************** + + MACRO + AlphaBlend32_32_init + MOV ht_info, #1 + MOV ht, #0 + ORR ht_info, ht_info, ht_info, LSL #16 ; &10001 + MEND + + MACRO + AlphaBlend32_32_1pixel $src, $dst, $tmp0, $tmp1, $tmp2, $known_not_transp + [ "$known_not_transp" = "" + MOVS $tmp2, $src, LSR #24 ; s_a + BEQ %FT09 ; fully transparent - use dst + ] + TEQ $tmp2, #&FF + BEQ %FT10 ; fully opaque - use src + UXTB $tmp0, $src, ROR #8 ; s_ag + ORR $tmp0, $tmp0, #&FF0000 + UXTB16 $tmp1, $src ; s_rb + MUL $tmp0, $tmp0, $tmp2 + MUL $tmp1, $tmp1, $tmp2 + RSB $tmp2, $tmp2, #&FF + UXTB16 $src, $dst, ROR #8 ; d_ag + UXTB16 $dst, $dst ; d_rb + MLA $src, $src, $tmp2, $tmp0 ; ag + MLA $dst, $dst, $tmp2, $tmp1 ; rb + USUB16 $tmp0, $src, ht_info + UXTAB16 $src, $src, $src, ROR #8 + SEL $tmp1, ht_info, ht + UXTAB16 $src, $tmp1, $src, ROR #8 + USUB16 $tmp0, $dst, ht_info + UXTAB16 $dst, $dst, $dst, ROR #8 + SEL $tmp1, ht_info, ht + UXTAB16 $dst, $tmp1, $dst, ROR #8 + ORR $src, $dst, $src, LSL #8 ; recombine + B %FT10 +09 MOV $src, $dst +10 + MEND + + MACRO + AlphaBlend32_32_32bits $src, $dst, $fixed_skew + Read1Word src, 0, carry, $fixed_skew, skew, scratch + ADD dst, dst, #1*4 + MOVS $wk7, $wk0, LSR #24 + BEQ %FT01 ; all pixels fully transparent - don't touch destination + LDR $wk4, [dst, #-1*4] + AlphaBlend32_32_1pixel $wk0, $wk4, $wk5, $wk6, $wk7, known_not_transp + Write1Word dst, 0 +01 + MEND + + MACRO + AlphaBlend32_32_64bits $src, $fixed_skew + Read2Words src, 0, carry, $fixed_skew, skew, scratch + ADD dst, dst, #2*4 + MOVS $wk7, $wk0, LSR #24 + MOVEQS $wk7, $wk1, LSR #24 + BEQ %FT01 ; all pixels fully transparent - don't touch destination + LDR $wk4, [dst, #-2*4] + AlphaBlend32_32_1pixel $wk0, $wk4, 
$wk5, $wk6, $wk7 + LDR $wk4, [dst, #-1*4] + AlphaBlend32_32_1pixel $wk1, $wk4, $wk5, $wk6, $wk7 + Write2Words dst, 0 +01 + MEND + + MACRO + AlphaBlend32_32_128bits_head $src, $fixed_skew, $intra_preloads + Read4Words src, 0, carry, $fixed_skew, skew, scratch + MEND + + MACRO + AlphaBlend32_32_128bits_tail $src + ADD dst, dst, #4*4 + MOVS $wk7, $wk0, LSR #24 + MOVEQS $wk7, $wk1, LSR #24 + MOVEQS $wk7, $wk2, LSR #24 + MOVEQS $wk7, $wk3, LSR #24 + BEQ %FT01 ; all pixels fully transparent - don't touch destination + LDR $wk4, [dst, #-4*4] + AlphaBlend32_32_1pixel $wk0, $wk4, $wk5, $wk6, $wk7 + LDR $wk4, [dst, #-3*4] + AlphaBlend32_32_1pixel $wk1, $wk4, $wk5, $wk6, $wk7 + LDR $wk4, [dst, #-2*4] + AlphaBlend32_32_1pixel $wk2, $wk4, $wk5, $wk6, $wk7 + LDR $wk4, [dst, #-1*4] + AlphaBlend32_32_1pixel $wk3, $wk4, $wk5, $wk6, $wk7 + Write4Words dst, 0 +01 + MEND + +;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance, +; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $cleanup + +AlphaBlend GenerateFunctions 32, 32,, \ + FLAG_DST_READWRITE :OR: FLAG_SPILL_LINE_VARS :OR: FLAG_PROCESS_PARALLEL :OR: FLAG_NO_PRELOAD_DST, 1, \ + "stride_d,stride_s,map,bitptrs,skew,orig_w,scratch,carry", \ + "x,stride_d,stride_s", bitptrs,, init ; leading_pixels_reg = wk3 + +; ******************************************************************** + + END
Added: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr =================================================================== --- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr (rev 0) +++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr 2013-06-18 23:13:50 UTC (rev 2744) @@ -0,0 +1,1939 @@ +; +; Copyright © 2013 Raspberry Pi Foundation +; Copyright © 2013 RISC OS Open Ltd +; +; Permission to use, copy, modify, distribute, and sell this software and its +; documentation for any purpose is hereby granted without fee, provided that +; the above copyright notice appear in all copies and that both that +; copyright notice and this permission notice appear in supporting +; documentation, and that the name of the copyright holders not be used in +; advertising or publicity pertaining to distribution of the software without +; specific, written prior permission. The copyright holders make no +; representations about the suitability of this software for any purpose. It +; is provided "as is" without express or implied warranty. +; +; THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS +; SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND +; FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY +; SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN +; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING +; OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS +; SOFTWARE. 
+; + + [ :LNOT: :DEF: DebugData + GBLL DebugData + ] + [ :LNOT: :DEF: DebugPld + GBLL DebugPld + ] + [ :LNOT: :DEF: VerboseBuild + GBLL VerboseBuild + ] + +; Flag bitfield definitions +FLAG_NO_HALFTONE * 0 :SHL: 0 +FLAG_SCALAR_HALFTONE * 1 :SHL: 0 +FLAG_VECTOR_HALFTONE * 2 :SHL: 0 +FLAG_NO_COLOUR_MAP * 0 :SHL: 2 +FLAG_COLOUR_MAP * 1 :SHL: 2 + +FLAG_DST_WRITEONLY * 0 :SHL: 3 +FLAG_DST_READWRITE * 1 :SHL: 3 +FLAG_SPILL_NO_LINE_VARS * 0 :SHL: 4 +FLAG_SPILL_LINE_VARS_WIDE * 1 :SHL: 4 +FLAG_SPILL_LINE_VARS_NON_WIDE * 2 :SHL: 4 +FLAG_SPILL_LINE_VARS * 3 :SHL: 4 +FLAG_EXPAND_SKEW * 0 :SHL: 6 +FLAG_NO_EXPAND_SKEW * 1 :SHL: 6 +FLAG_PROCESS_SERIAL * 0 :SHL: 7 ; sub-word data is presented MS-aligned, and results are expected LS-aligned +FLAG_PROCESS_PARALLEL * 1 :SHL: 7 ; sub-word data retains its original alignment throughout (only useful if src & dest depths same) +FLAG_MAX_128BIT_MACRO * 0 :SHL: 8 +FLAG_MAX_256BIT_MACRO * 1 :SHL: 8 ; particularly tight loops can sometimes benefit from being unrolled to allow 2x 128-bit blocks to be staggered +FLAG_PRELOAD_DST * 0 :SHL: 9 +FLAG_NO_PRELOAD_DST * 1 :SHL: 9 + +; Offsets into stack + GBLA args_stack_offset +args_stack_offset SETA 9*4 + GBLA locals_stack_offset +locals_stack_offset SETA 0 + +; Top-level macro arguments are held in variables for convenience + GBLA src_bpp + GBLA dst_w_bpp + GBLA flags + GBLA prefetch_distance + GBLS leading_pixels_reg + GBLS preload_offset_reg + GBLS line_saved_regs + GBLS init + GBLS newline + GBLS reinitwk + GBLS cleanup +; Derived values + GBLS prefix + GBLA dst_r_bpp + GBLA src_bpp_shift + GBLA dst_bpp_shift + GBLL sub_byte + GBLA num_line_saved_regs + GBLA pix_per_block + +; Work registers - variables so they can be reassigned between functions +; (should always be assigned in increasing register number though) + GBLA wk0_num + GBLA wk1_num + GBLA wk2_num + GBLA wk3_num + GBLA wk4_num + GBLA wk5_num + GBLA wk6_num + GBLA wk7_num + GBLA wk8_num + GBLA wk9_num + GBLA wk10_num +; String 
versions of the same + GBLS wk0 + GBLS wk1 + GBLS wk2 + GBLS wk3 + GBLS wk4 + GBLS wk5 + GBLS wk6 + GBLS wk7 + GBLS wk8 + GBLS wk9 + GBLS wk10 + + + [ DebugData :LOR: DebugPld + IMPORT printf + ] + GBLL PrintAtStartOfLine +PrintAtStartOfLine SETL {TRUE} +; Print<cond>: debug trace. Assembles to nothing unless Debug<switch> is true. +; With a condition other than AL, the whole sequence is branched over using the opposite condition. +; r0-r12, the pre-call sp and lr are stacked in register-number order, so each of the reg0-reg2 arguments is +; fetched from [sp, #number*4] and shows the value the register had before the macro. The CPSR is held in v1 +; across the call and written back afterwards; the format string is placed inline at label 80 and printf +; returns to label 81. PrintAtStartOfLine remembers whether the previous format ended in a newline, so the +; switch-name prefix is only emitted at the start of an output line. +; NOTE(review): relies on printf preserving v1 - expected under the standard ARM procedure call convention, confirm for the target. + MACRO + Print$cond $switch, $fmt, $reg0, $reg1, $reg2 + [ Debug$switch + [ "$cond" <> "" :LAND: "$cond" <> "AL" + LCLS opp +opp SETS :REVERSE_CC: "$cond" + B$opp %FT82 + ] + PUSH {r12,r14} + PUSH {r0-r12} + ADD ip, sp, #15*4 + STR ip, [sp, #13*4] + MRS v1, CPSR + [ "$reg0" <> "" + LDR a2, [sp, #:RCONST:$reg0 * 4] + ] + [ "$reg1" <> "" + LDR a3, [sp, #:RCONST:$reg1 * 4] + ] + [ "$reg2" <> "" + LDR a4, [sp, #:RCONST:$reg2 * 4] + ] + ADR a1, %FT80 + ADR lr, %FT81 + B printf +80 + [ PrintAtStartOfLine + = "$switch: " + ] + = "$fmt", 0 +PrintAtStartOfLine SETL "$fmt" :RIGHT: 1 = "\n" + ALIGN +81 MSR CPSR_cxsf, v1 + POP {r0-r12} + ADD sp, sp, #4 + POP {r14} +82 + ] + MEND + + [ :LNOT: :DEF: |objasm$version| :LAND: :LNOT: :DEF: |ads$version| + ; Assume asasm, which is lacking a number of key opcodes + ; Note there's a bug in asasm, the CC_ENCODING value shouldn't need shifting + ; Each macro below builds the instruction word with DCI: the condition code is shifted to bit 28 and the + ; register numbers are placed at bit 16 (Rn), bit 12 (Rd) and bit 0 (Rm). + + MACRO +$label SEL$cond $Rd, $Rn, $Rm +$label DCI :CC_ENCODING:"$cond":SHL:28 :OR: &06800FB0 :OR: (:RCONST:$Rn :SHL: 16) :OR: (:RCONST:$Rd :SHL: 12) :OR: (:RCONST:$Rm :SHL: 0) + MEND + + MACRO +$label UADD8$cond $Rd, $Rn, $Rm +$label DCI :CC_ENCODING:"$cond":SHL:28 :OR: &06500F90 :OR: (:RCONST:$Rn :SHL: 16) :OR: (:RCONST:$Rd :SHL: 12) :OR: (:RCONST:$Rm :SHL: 0) + MEND + + MACRO +$label USUB8$cond $Rd, $Rn, $Rm +$label DCI :CC_ENCODING:"$cond":SHL:28 :OR: &06500FF0 :OR: (:RCONST:$Rn :SHL: 16) :OR: (:RCONST:$Rd :SHL: 12) :OR: (:RCONST:$Rm :SHL: 0) + MEND + + MACRO +$label USUB16$cond $Rd, $Rn, $Rm +$label DCI :CC_ENCODING:"$cond":SHL:28 :OR: &06500F70 :OR: (:RCONST:$Rn :SHL: 16) :OR: (:RCONST:$Rd :SHL: 12) :OR: (:RCONST:$Rm :SHL: 0) + MEND + + ; SETEND takes LE or BE; any other argument is reported as an assembly error + MACRO +$label SETEND $endian + IF "$endian" = "LE" +$label DCI &F1010000 + ELIF "$endian" = "BE" +$label 
DCI &F1010200 + ELSE + ! 1, "Unrecognised SETEND endianness" + ENDIF + MEND + + ] + +; Add a constant, using a minimal number of ARM instructions +; Doesn't handle cases where bit 31 of constant is set, but we're not expecting any of those +; The constant is split into 8-bit fields starting at the lowest set even-aligned bit pair, and one ADD is +; emitted per field; the first reads src, later ones accumulate in dst. +; NOTE(review): a field that happens to be zero still gets an ADD #0, and for a constant of 0 lsb is 0 and stays 0 +; after lsb * 256, so the WHILE loop never finishes - callers must not pass 0. + MACRO +$lab AddL $dst, $src, $const + LCLA tmp +tmp SETA $const +tmp SETA tmp :OR: (((tmp :AND: &55555555) :SHL: 1) + ((tmp :AND: &AAAAAAAA) :SHR: 1)) + LCLA lsb +lsb SETA tmp :AND::NOT: (tmp-1) +tmp SETA tmp :OR: (tmp :SHR: 2) +tmp SETA tmp :OR: (tmp :SHR: 4) +tmp SETA tmp :OR: (tmp :SHR: 8) +tmp SETA tmp :OR: (tmp :SHR: 16) + LCLA msb +msb SETA (tmp+1) :AND::NOT: tmp + LCLS reg +reg SETS "$src" +$lab + WHILE lsb < msb + ADD $dst, $reg, #($const) :AND: (lsb * &FF) +lsb SETA lsb * 256 +reg SETS "$dst" + WEND + MEND + +; Find log2 of a variable +; out must be an arithmetic variable; it receives the number of times in can be halved before reaching 1 +; (the rounded-down log2), or -1 when in is 0 + MACRO +$out Log2 $in + [ $in = 0 +$out SETA -1 + | + LCLA tmp +tmp SETA $in +$out SETA 0 + WHILE tmp > 1 +tmp SETA tmp / 2 +$out SETA $out + 1 + WEND + ] + MEND + +; Find max of two numbers +; out receives a when a > b, otherwise b + MACRO +$out Max $a, $b + [ $a > $b +$out SETA $a + | +$out SETA $b + ] + MEND + +; Find if an integer is the last in a group of a power-of-2 integers +; The result is {TRUE} when size is below 2, or when bit (size / 2) of index belongs to its run of trailing one bits, +; which for a power-of-2 size means index :AND: (size - 1) = size - 1. + MACRO +$result IsEndOfGroup $index, $size + LCLA index +index SETA $index + LCLA size +size SETA $size + [ size < 2 +$result SETL {TRUE} + | +$result SETL (index :AND::NOT: (index + 1)) :AND: (size / 2) > 0 + ] + MEND + +; Convert an integer to a decimal string +; Digits are produced least-significant first and prepended to the result; zero gives the single digit 0 + MACRO +$str DecimalStr $num + LCLA n +n SETA $num +$str SETS "" + WHILE n <> 0 +$str SETS :CHR:(48 + n % 10) :CC: $str +n SETA n / 10 + WEND + IF :LEN: $str = 0 +$str SETS "0" + ENDIF + MEND + +; Convert a wk register index into the name of the physical register +; Builds the name wk<index> and returns the current contents of that string variable (as set by AssignWk) + MACRO +$str LookupWk $index + LCLS wk +wk DecimalStr $index +wk SETS "wk$wk" +$str SETS $wk + MEND + +; Assign the wk registers from a list of registers +; Each list element in turn, starting from wk0, sets wk<n>_num to the register number and wk<n> to r followed +; by that number; entries not covered by the list, up to wk10, get -1 and an invalid_register_wk<n> name. + MACRO + AssignWk $list + LCLA wk_num + LCLS wk + LCLS tail + LCLS reg +wk_num SETA 0 +tail SETS "$list," + WHILE :LEN: tail > 0 +wk DecimalStr wk_num +wk_num SETA wk_num + 1 
+reg SETS "" + WHILE tail :LEFT: 1 <> "," +reg SETS reg :CC: (tail :LEFT: 1) +tail SETS tail :RIGHT: (:LEN:tail - 1) + WEND +tail SETS tail :RIGHT: (:LEN:tail - 1) +wk$wk._num SETA :RCONST: $reg +wk$wk DecimalStr wk$wk._num +wk$wk SETS "r" :CC: wk$wk + WEND + ; Ensure the remaining ones aren't used + WHILE wk_num <= 10 +wk DecimalStr wk_num +wk_num SETA wk_num + 1 +wk$wk._num SETA -1 +wk$wk SETS "invalid_register_wk$wk" + WEND + MEND + +; See if a given register name is in a comma-separated list of registers +; out is {TRUE} if some element of list is textually identical to reg, otherwise {FALSE}; the comparison is a +; plain string match, so the spelling must agree exactly + MACRO +$out RegIsInList $reg, $list + LCLS tail +tail SETS "$list," + WHILE :LEN: tail > 0 + [ :LEN: "$reg," <= :LEN: tail + [ "$reg," = tail :LEFT: :LEN: "$reg," +$out SETL {TRUE} + MEXIT + ] + ] + WHILE tail :LEFT: 1 <> "," +tail SETS tail :RIGHT: (:LEN:tail - 1) + WEND +tail SETS tail :RIGHT: (:LEN:tail - 1) + WEND +$out SETL {FALSE} + MEND + +; Count how many registers are in a comma-separated list of registers +; The count is 1 plus the number of commas in list. +; NOTE(review): an empty list therefore also counts as 1 - confirm callers never pass one or expect that. + MACRO +$out CountRegsInList $list +$out SETA 1 + LCLS tail +tail SETS "$list" + WHILE :LEN: tail > 0 + [ tail :LEFT: 1 = "," +$out SETA $out + 1 + ] +tail SETS tail :RIGHT: (:LEN:tail -1) + WEND + MEND + +; Data read macros + +; ReadFirstSubWord assembles to nothing unless 0 < src_bpp < 32. data is a wk index resolved through LookupWk. +; One of three variants is chosen at assembly time: FLAG_NO_EXPAND_SKEW set (skew held in the skew register), +; fixed_skew equal to 0, or a non-zero constant fixed_skew. Words are loaded from [base] with a post-increment of 4; +; where skew applies, the result is the data word shifted left by the skew ORed with carry shifted right by 32 minus +; the skew. Without FLAG_PROCESS_PARALLEL the result is then rotated right by (pixels AND (32/src_bpp - 1)) +; shifted left by src_bpp_shift. +; NOTE(review): in the first and third variants tmp is only written when pixels is not the literal #0, yet it is +; compared with 0 afterwards in every case - confirm what callers passing #0 leave in tmp. + MACRO +$lab ReadFirstSubWord $base, $data, $carry, $pixels, $fixed_skew, $skew, $tmp +$lab + [ src_bpp > 0 :LAND: src_bpp < 32 + LCLS reg0 +reg0 LookupWk $data + IF (flags :AND: FLAG_NO_EXPAND_SKEW) > 0 + [ "$pixels" <> "#0" + AND $tmp, $pixels, #32/src_bpp - 1 + CMP $tmp, $skew, LSR #src_bpp_shift + PrintHI Data, "ReadFirstSubWord: left@%p", $base + LDRHI $reg0, [$base], #4 + PrintHI Data, " %08X\n", $reg0 + ] + CMP $skew, #0 + PrintHI Data, "ReadFirstSubWord: right@%p", $base + LDRHI $carry, [$base], #4 + PrintHI Data, " %08X\n", $carry + CMP $tmp, #0 + BEQ %FT01 + RSB $tmp, $skew, #32 + MOV $reg0, $reg0, LSL $skew + ORR $reg0, $reg0, $carry, LSR $tmp + Print Data, "ReadFirstSubWord: skew %u -> %08X\n", $skew, $reg0 + [ flags :AND: FLAG_PROCESS_PARALLEL = 0 :LAND: "$pixels" <> "#0" + AND $tmp, $pixels, #32/src_bpp - 1 
+ MOV $tmp, $tmp, LSL #src_bpp_shift + MOV $reg0, $reg0, ROR $tmp + ] +01 + ELIF $fixed_skew == 0 + [ "$pixels" <> "#0" + ANDS $tmp, $pixels, #32/src_bpp - 1 + [ flags :AND: FLAG_PROCESS_PARALLEL = 0 + BEQ %FT01 + Print Data, "ReadFirstSubWord: left@%p", $base + LDR $reg0, [$base], #4 + Print Data, " %08X\n", $reg0 + MOV $tmp, $tmp, LSL #src_bpp_shift + MOV $reg0, $reg0, ROR $tmp +01 + | + PrintNE Data, "ReadFirstSubWord: left@%p", $base + LDRNE $reg0, [$base], #4 + PrintNE Data, " %08X\n", $reg0 + ] + ] + ELSE + [ "$pixels" <> "#0" + AND $tmp, $pixels, #32/src_bpp - 1 + CMP $tmp, #$fixed_skew/src_bpp + PrintHI Data, "ReadFirstSubWord: left@%p", $base + LDRHI $reg0, [$base], #4 + PrintHI Data, " %08X\n", $reg0 + ] + Print Data, "ReadFirstSubWord: right@%p", $base + LDR $carry, [$base], #4 + Print Data, " %08X\n", $carry + CMP $tmp, #0 + BEQ %FT01 + MOV $reg0, $reg0, LSL #$fixed_skew + ORR $reg0, $reg0, $carry, LSR #32-$fixed_skew + Print Data, "ReadFirstSubWord: skew $fixed_skew -> %08X\n", $reg0 + [ flags :AND: FLAG_PROCESS_PARALLEL = 0 + MOV $tmp, $tmp, LSL #src_bpp_shift + MOV $reg0, $reg0, ROR $tmp + ] +01 + ENDIF + ] + MEND + + MACRO +$lab ReadLastSubWord $base, $data, $carry, $pixels, $fixed_skew, $skew, $tmp +$lab + [ src_bpp > 0 :LAND: src_bpp < 32 + LCLS reg0 +reg0 LookupWk $data + IF (flags :AND: FLAG_NO_EXPAND_SKEW) > 0 + CMP $skew, #0 + BHI %FT01 + TST $pixels, #32/src_bpp - 1 + PrintNE Data, "ReadLastSubWord: next@%p", $base + LDRNE $reg0, [$base], #4 + PrintNE Data, " %08X\n", $reg0 + B %FT02 +01 + Print Data, "ReadLastSubWord: left %08X\n", $carry + MOV $reg0, $carry, LSL $skew + AND $tmp, $pixels, #32/src_bpp - 1 + RSB $tmp, $tmp, #32/src_bpp + CMP $tmp, $skew, LSR #src_bpp_shift + BHS %FT02 + Print Data, "ReadLastSubWord: right@%p", $base + LDR $carry, [$base], #4
@@ Diff output truncated at 50000 characters. @@
vm-dev@lists.squeakfoundation.org