audio-resampler: Update NEON to handle remainders not multiples of 4

If the remainder is not evenly divisible by 4, subtracting 4 on each iteration steps past zero without ever hitting it, so the `bne` check never fails and the loop continues until it crashes. Change the branch to `bgt` so that a negative remainder also terminates the loop. This more closely matches the SSE loop.
2025-02-17 11:45:25 +00:00 · 2019-09-02 23:25:39 -04:00 · 2019-09-02 23:25:39 -04:00 · 0c955c16ce
commit 0c955c16ce
parent 909baa2360
1 changed files with 4 additions and 4 deletions
--- a/gst-libs/gst/audio/audio-resampler-neon.h
+++ b/gst-libs/gst/audio/audio-resampler-neon.h
@ -46,7 +46,7 @@ inner_product_gint16_full_1_neon (gint16 * o, const gint16 * a,
                  "      vld1.16 {d20}, [%[a]]!\n"
                  "      subs %[remainder], %[remainder], #4\n"
                  "      vmlal.s16 q0, d16, d20\n"
-                  "      bne 3b\n"
+                  "      bgt 3b\n"
                  "4:"
                  "      vadd.s32 d0, d0, d1\n"
                  "      vpadd.s32 d0, d0, d0\n"
@ -97,7 +97,7 @@ inner_product_gint16_linear_1_neon (gint16 * o, const gint16 * a,
                  "      subs %[remainder], %[remainder], #4\n"
                  "      vmlal.s16 q0, d16, d24\n"
                  "      vmlal.s16 q1, d20, d24\n"
-                  "      bne 3b\n"
+                  "      bgt 3b\n"
                  "4:"
                  "      vld2.16 {d20[], d21[]}, [%[ic]]\n"
                  "      vshrn.s32 d0, q0, #15\n"
@ -272,7 +272,7 @@ inner_product_gint32_full_1_neon (gint32 * o, const gint32 * a,
                  "      subs %[remainder], %[remainder], #4\n"
                  "      vmlal.s32 q0, d16, d20\n"
                  "      vmlal.s32 q0, d17, d21\n"
-                  "      bne 3b\n"
+                  "      bgt 3b\n"
                  "4:"
                  "      vadd.s64 d0, d0, d1\n"
                  "      vqrshrn.s64 d0, q0, #31\n"
@ -487,7 +487,7 @@ inner_product_gfloat_full_1_neon (gfloat * o, const gfloat * a,
                  "      vld1.32 {q10}, [%[a]]!\n"
                  "      subs %[remainder], %[remainder], #4\n"
                  "      vmla.f32 q0, q6, q10\n"
-                  "      bne 3b\n"
+                  "      bgt 3b\n"
                  "4:"
                  "      vadd.f32 d0, d0, d1\n"
                  "      vpadd.f32 d0, d0, d0\n"