mirror of
https://gitlab.freedesktop.org/gstreamer/gstreamer.git
synced 2025-01-01 04:58:47 +00:00
336 lines
10 KiB
ArmAsm
336 lines
10 KiB
ArmAsm
; PowerPC optimized zoom for Goom
|
|
; © 2001-2003 Guillaume Borios
|
|
; This library is free software; you can redistribute it and/or
|
|
; modify it under the terms of the GNU Library General Public
|
|
; License as published by the Free Software Foundation; either
|
|
; version 2 of the License, or (at your option) any later version.
|
|
;
|
|
; This library is distributed in the hope that it will be useful,
|
|
; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
; Library General Public License for more details.
|
|
;
|
|
; You should have received a copy of the GNU Library General Public
|
|
; License along with this library; if not, write to the
|
|
; Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
|
|
; Boston, MA 02110-1301, USA.
|
|
|
|
; Change log :
|
|
; 21 Dec 2003 : Use of altivec is now determined with a parameter
|
|
|
|
; Section definition : We use a read only section
|
|
.text
|
|
|
|
; name of the function to call by C program : ppc_zoom
|
|
; We declare this label as a global to extend its scope outside this file
|
|
.globl _ppc_zoom_generic
|
|
.globl _ppc_zoom_G4
|
|
|
|
; Description :
|
|
; This routine dynamically computes and applies a zoom filter
|
|
|
|
; parameters :
|
|
; r3 <=> unsigned int sizeX (in pixels)
|
|
; r4 <=> unsigned int sizeY (in pixels)
|
|
; r5 <=> unsigned int * frompixmap
|
|
; r6 <=> unsigned int * topixmap
|
|
; r7 <=> unsigned int * brutS
|
|
; r8 <=> unsigned int * brutD
|
|
; r9 <=> unsigned int buffratio
|
|
; r10 <=> int [16][16] precalccoeffs
|
|
|
|
; globals after init
|
|
; r5 <=> frompixmap - 1 byte needed for preincremental fetch (replaces r5)
|
|
; r6 <=> topixmap - 1 byte needed for preincremental fetch (replaces r6)
|
|
; r3 <=> ax = x max in 16th of pixels (replaces old r3)
|
|
; r4 <=> ay = y max in 16th of pixels (replaces old r4)
|
|
; r20 <=> row size in bytes
|
|
; r12 <=> 0xFF00FF (mask for parallel 32 bits pixs computing)
|
|
; r30 <=> brutS - 1 byte needed for preincremental fetch (replaces r7)
|
|
; r31 <=> brutD - 1 byte needed for preincremental fetch (replaces r8)
|
|
|
|
; ABI notes :
|
|
; r1 is the Stack Pointer (SP) => Do not use
|
|
; r13..r31 are non-volatiles => Do not use
|
|
|
|
_ppc_zoom_generic:
|
|
|
|
; Saves the used non volatile registers in the Mach-O stack s Red-Zone
|
|
stmw r18,-56(r1)
|
|
|
|
; init
|
|
li r18,0 ; Default value if out of range : 0 (Black)
|
|
mr r11,r10
|
|
lis r12,0xFF
|
|
mullw r2,r3,r4 ; Number of pixels to compute
|
|
subi r30,r8,0
|
|
slwi r20,r3,2
|
|
srawi r19,r20,2
|
|
ori r12,r12,0xFF
|
|
subi r3,r3,1
|
|
subi r4,r4,1
|
|
mtspr ctr,r2 ; Init the loop count (one loop per pixel computed)
|
|
subi r31,r7,0
|
|
subi r6,r6,4
|
|
slwi r3,r3,4
|
|
slwi r4,r4,4
|
|
|
|
;pre init for loop
|
|
lwz r2,0(r31) ; px
|
|
lwz r29,4(r31) ; py
|
|
lwz r8,0(r30) ; px2
|
|
lwz r10,4(r30) ; py2
|
|
|
|
b L1
|
|
.align 5
|
|
L1:
|
|
|
|
; computes dynamically the position to fetch
|
|
sub r8,r8,r2
|
|
sub r10,r10,r29
|
|
mullw r8,r8,r9
|
|
addi r31,r31,8
|
|
mullw r10,r10,r9
|
|
addi r30,r30,8
|
|
|
|
srawi r8,r8,16
|
|
srawi r10,r10,16
|
|
add r2,r2,r8
|
|
add r29,r29,r10
|
|
|
|
; if px>ax or py>ay goto outofrange
|
|
; computes the attenuation coeffs and the original point address
|
|
rlwinm r10,r2,6,28-6,31-6 ; r10 <- (r2 << 2) & 0x000002D0 (r10=(r2%16)*4*16)
|
|
cmpl cr4,0,r2,r3
|
|
rlwimi r10, r29, 2, 28-2, 31-2 ; r10 <- ((r29 << 2) & 0x0000002D) | (r10 & !0x0000002D) (r10=(r10%16)*4 | r10)
|
|
cmpl cr7,0,r29,r4
|
|
srawi r29,r29,4 ; pos computing
|
|
bge- cr4,L4
|
|
srawi r2,r2,4 ; pos computing
|
|
mullw r29, r29,r19 ; pos computing
|
|
bge- cr7,L4
|
|
|
|
; Channels notation : 00112233 (AARRVVBB)
|
|
|
|
add r2,r2,r29 ; pos computing
|
|
lwzx r10,r11,r10 ; Loads coefs
|
|
slwi r2,r2,2 ; pos computing
|
|
add r2,r2,r5 ; pos computing
|
|
rlwinm r21,r10,0,24,31 ; Isolates coef1 (??????11 -> 00000011)
|
|
lwz r25,0(r2) ; Loads col1 -> r25
|
|
lwz r26,4(r2) ; Loads col2 -> r26
|
|
rlwinm r22,r10,24,24,31 ; Isolates coef2 (????22?? -> 00000022)
|
|
rlwinm r23,r10,16,24,31 ; Isolates coef3 (??33???? -> 00000033)
|
|
add r2,r2,r20 ; Adds one line for future load of col3 and col4
|
|
and r8, r25,r12 ; Masks col1 channels 1 & 3 : 0x00XX00XX
|
|
rlwinm r24,r10,8,24,31 ; Isolates coef4 (44?????? -> 00000044)
|
|
andi. r25,r25,0xFF00 ; Masks col1 channel 2 : 0x0000XX00
|
|
mullw r8, r8, r21 ; Applies coef1 on col1 channels 1 & 3
|
|
|
|
|
|
; computes final pixel color
|
|
and r10,r26,r12 ; Masks col2 channels 1 & 3 : 0x00XX00XX
|
|
lwz r27,0(r2) ; Loads col3 -> r27
|
|
mullw r10,r10,r22 ; Applies coef2 on col2 channels 1 & 3
|
|
mullw r25,r25,r21 ; Applies coef1 on col1 channel 2
|
|
andi. r29,r26,0xFF00 ; Masks col2 channel 2 : 0x0000XX00
|
|
mullw r29,r29,r22 ; Applies coef2 on col2 channel 2
|
|
lwz r28,4(r2) ; Loads col4 -> r28
|
|
add r8 ,r8 ,r10 ; Adds col1 & col2 channels 1 & 3
|
|
and r10,r27,r12 ; Masks col3 channels 1 & 3 : 0x00XX00XX
|
|
add r25,r25,r29 ; Adds col1 & col2 channel 2
|
|
mullw r10,r10,r23 ; Applies coef3 on col3 channels 1 & 3
|
|
andi. r29,r27,0xFF00 ; Masks col3 channel 2 : 0x0000XX00
|
|
mullw r29,r29,r23 ; Applies coef3 on col3 channel 2
|
|
lwz r2,0(r31) ; px
|
|
add r7 ,r8 ,r10 ; Adds col3 to (col1 + col2) channels 1 & 3
|
|
and r10,r28,r12 ; Masks col4 channels 1 & 3 : 0x00XX00XX
|
|
mullw r10,r10,r24 ; Applies coef4 on col4 channels 1 & 3
|
|
add r25,r25,r29 ; Adds col 3 to (col1 + col2) channel 2
|
|
lwz r8,0(r30) ; px2
|
|
andi. r28,r28,0xFF00 ; Masks col4 channel 2 : 0x0000XX00
|
|
add r7 ,r7 ,r10 ; Adds col4 to (col1 + col2 + col3) channels 1 & 3
|
|
lwz r10,4(r30) ; py2
|
|
mullw r28,r28,r24 ; Applies coef4 on col4 channel 2
|
|
srawi r7, r7, 8 ; (sum of channels 1 & 3) >> 8
|
|
lwz r29,4(r31) ; py
|
|
add r25,r25,r28 ; Adds col 4 to (col1 + col2 + col3) channel 2
|
|
rlwimi r7, r25, 24, 16, 23 ; (((sum of channels 2) >> 8 ) & 0x0000FF00) | ((sum of channels 1 and 3) & 0xFFFF00FF)
|
|
stwu r7,4(r6) ; Stores the computed pixel
|
|
bdnz L1 ; Iterate again if needed
|
|
b L3 ;goto end ; If not, returns from the function
|
|
|
|
|
|
; if out of range
|
|
L4:
|
|
stwu r18,4(r6)
|
|
lwz r8,0(r30) ; px2
|
|
lwz r10,4(r30) ; py2
|
|
lwz r2,0(r31) ; px
|
|
lwz r29,4(r31) ; py
|
|
bdnz L1
|
|
|
|
|
|
L3:
|
|
|
|
; Restore saved registers and return
|
|
lmw r18,-56(r1)
|
|
blr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_ppc_zoom_G4:
|
|
|
|
; Saves the used non volatile registers in the Mach-O stack s Red-Zone
|
|
stmw r17,-60(r1)
|
|
|
|
; init
|
|
li r18,0 ; Default value if out of range : 0 (Black)
|
|
mr r11,r10
|
|
lis r12,0xFF
|
|
mullw r2,r3,r4 ; Number of pixels to compute
|
|
subi r30,r8,0
|
|
slwi r20,r3,2
|
|
srawi r19,r20,2
|
|
ori r12,r12,0xFF
|
|
subi r3,r3,1
|
|
subi r4,r4,1
|
|
mtspr ctr,r2 ; Init the loop count (one loop per pixel computed)
|
|
subi r31,r7,0
|
|
subi r6,r6,4
|
|
slwi r3,r3,4
|
|
slwi r4,r4,4
|
|
|
|
;pre init for loop
|
|
lwz r2,0(r31) ; px
|
|
lwz r29,4(r31) ; py
|
|
lwz r8,0(r30) ; px2
|
|
lwz r10,4(r30) ; py2
|
|
|
|
;*********************
|
|
lis r17,0x0F01
|
|
|
|
b L100
|
|
.align 5
|
|
L100:
|
|
|
|
addi r6,r6,4
|
|
|
|
; Optimization to ensure the destination buffer
|
|
; won't be loaded into the data cache
|
|
rlwinm. r0,r6,0,27,31
|
|
bne+ L500
|
|
dcbz 0,r6
|
|
;dcba 0,r6
|
|
L500:
|
|
|
|
; computes dynamically the position to fetch
|
|
;mullw r8,r8,r29
|
|
;mullw r2,r2,r29
|
|
;add r2,r8,r2
|
|
;srawi r2,r2,17
|
|
|
|
sub r8,r8,r2
|
|
sub r10,r10,r29
|
|
mullw r8,r8,r9
|
|
addi r31,r31,8
|
|
mullw r10,r10,r9
|
|
addi r30,r30,8
|
|
|
|
dst r30,r17,0
|
|
|
|
srawi r8,r8,16
|
|
srawi r10,r10,16
|
|
add r2,r2,r8
|
|
add r29,r29,r10
|
|
|
|
dst r31,r17,1
|
|
|
|
; if px>ax or py>ay goto outofrange
|
|
; computes the attenuation coeffs and the original point address
|
|
rlwinm r10,r2,6,28-6,31-6 ; r10 <- (r2 << 2) & 0x000002D0 (r10=(r2%16)*4*16)
|
|
cmpl cr4,0,r2,r3
|
|
rlwimi r10, r29, 2, 28-2, 31-2 ; r10 <- ((r29 << 2) & 0x0000002D) | (r10 & !0x0000002D) (r10=(r29%16)*4 | r10)
|
|
cmpl cr7,0,r29,r4
|
|
srawi r29,r29,4 ; pos computing
|
|
bge- cr4,L400
|
|
srawi r2,r2,4 ; pos computing
|
|
mullw r29, r29,r19 ; pos computing
|
|
bge- cr7,L400
|
|
|
|
; Channels notation : 00112233 (AARRVVBB)
|
|
|
|
add r2,r2,r29 ; pos computing
|
|
lwzx r10,r11,r10 ; Loads coefs
|
|
slwi r2,r2,2 ; pos computing
|
|
add r2,r2,r5 ; pos computing
|
|
rlwinm r21,r10,0,24,31 ; Isolates coef1 (??????11 -> 00000011)
|
|
lwz r25,0(r2) ; Loads col1 -> r25
|
|
lwz r26,4(r2) ; Loads col2 -> r26
|
|
rlwinm r22,r10,24,24,31 ; Isolates coef2 (????22?? -> 00000022)
|
|
rlwinm r23,r10,16,24,31 ; Isolates coef3 (??33???? -> 00000033)
|
|
add r2,r2,r20 ; Adds one line for future load of col3 and col4
|
|
and r8, r25,r12 ; Masks col1 channels 1 & 3 : 0x00XX00XX
|
|
rlwinm r24,r10,8,24,31 ; Isolates coef4 (44?????? -> 00000044)
|
|
dst r2,r17,2
|
|
rlwinm r25,r25,0,16,23 ; Masks col1 channel 2 : 0x0000XX00
|
|
;andi. r25,r25,0xFF00 ; Masks col1 channel 2 : 0x0000XX00
|
|
mullw r8, r8, r21 ; Applies coef1 on col1 channels 1 & 3
|
|
|
|
|
|
; computes final pixel color
|
|
and r10,r26,r12 ; Masks col2 channels 1 & 3 : 0x00XX00XX
|
|
lwz r27,0(r2) ; Loads col3 -> r27
|
|
mullw r10,r10,r22 ; Applies coef2 on col2 channels 1 & 3
|
|
mullw r25,r25,r21 ; Applies coef1 on col1 channel 2
|
|
rlwinm r29,r26,0,16,23 ; Masks col2 channel 2 : 0x0000XX00
|
|
;andi. r29,r26,0xFF00 ; Masks col2 channel 2 : 0x0000XX00
|
|
mullw r29,r29,r22 ; Applies coef2 on col2 channel 2
|
|
lwz r28,4(r2) ; Loads col4 -> r28
|
|
add r8 ,r8 ,r10 ; Adds col1 & col2 channels 1 & 3
|
|
and r10,r27,r12 ; Masks col3 channels 1 & 3 : 0x00XX00XX
|
|
add r25,r25,r29 ; Adds col1 & col2 channel 2
|
|
mullw r10,r10,r23 ; Applies coef3 on col3 channels 1 & 3
|
|
rlwinm r29,r27,0,16,23 ; Masks col3 channel 2 : 0x0000XX00
|
|
;andi. r29,r27,0xFF00 ; Masks col3 channel 2 : 0x0000XX00
|
|
mullw r29,r29,r23 ; Applies coef3 on col3 channel 2
|
|
lwz r2,0(r31) ; px
|
|
add r7 ,r8 ,r10 ; Adds col3 to (col1 + col2) channels 1 & 3
|
|
and r10,r28,r12 ; Masks col4 channels 1 & 3 : 0x00XX00XX
|
|
mullw r10,r10,r24 ; Applies coef4 on col4 channels 1 & 3
|
|
add r25,r25,r29 ; Adds col 3 to (col1 + col2) channel 2
|
|
lwz r8,0(r30) ; px2
|
|
rlwinm r28,r28,0,16,23 ; Masks col4 channel 2 : 0x0000XX00
|
|
;andi. r28,r28,0xFF00 ; Masks col4 channel 2 : 0x0000XX00
|
|
add r7 ,r7 ,r10 ; Adds col4 to (col1 + col2 + col3) channels 1 & 3
|
|
lwz r10,4(r30) ; py2
|
|
mullw r28,r28,r24 ; Applies coef4 on col4 channel 2
|
|
srawi r7, r7, 8 ; (sum of channels 1 & 3) >> 8
|
|
lwz r29,4(r31) ; py
|
|
add r25,r25,r28 ; Adds col 4 to (col1 + col2 + col3) channel 2
|
|
rlwimi r7, r25, 24, 16, 23 ; (((sum of channels 2) >> 8 ) & 0x0000FF00) | ((sum of channels 1 and 3) & 0xFFFF00FF)
|
|
stw r7,0(r6) ; Stores the computed pixel
|
|
bdnz L100 ; Iterate again if needed
|
|
b L300 ;goto end ; If not, returns from the function
|
|
|
|
|
|
; if out of range
|
|
L400:
|
|
stw r18,0(r6)
|
|
lwz r8,0(r30) ; px2
|
|
lwz r10,4(r30) ; py2
|
|
lwz r2,0(r31) ; px
|
|
lwz r29,4(r31) ; py
|
|
bdnz L100
|
|
|
|
|
|
L300:
|
|
|
|
; Restore saved registers and return
|
|
lmw r17,-60(r1)
|
|
blr
|