diff --git a/gst/gstutils.c b/gst/gstutils.c
index f9155e1ed7..302e3f84fb 100644
--- a/gst/gstutils.c
+++ b/gst/gstutils.c
@@ -204,83 +204,19 @@ typedef union
   } l;
 } GstUInt64;
 
-/* based on Hacker's Delight p152 */
-static guint64
-gst_util_div128_64 (GstUInt64 c1, GstUInt64 c0, guint64 denom)
+/* multiply two 64-bit unsigned ints into a 128-bit unsigned int.  the high
+ * and low 64 bits of the product are placed in c1 and c0 respectively.
+ * this operation cannot overflow. */
+static void
+gst_util_uint64_mul_uint64 (GstUInt64 * c1, GstUInt64 * c0, guint64 arg1,
+    guint64 arg2)
 {
-  GstUInt64 q1, q0, rhat;
-  GstUInt64 v, cmp1, cmp2;
-  guint s;
-
-  v.ll = denom;
-
-  /* count number of leading zeroes, we know they must be in the high
-   * part of denom since denom > G_MAXUINT32. */
-  s = v.l.high | (v.l.high >> 1);
-  s |= (s >> 2);
-  s |= (s >> 4);
-  s |= (s >> 8);
-  s = ~(s | (s >> 16));
-  s = s - ((s >> 1) & 0x55555555);
-  s = (s & 0x33333333) + ((s >> 2) & 0x33333333);
-  s = (s + (s >> 4)) & 0x0f0f0f0f;
-  s += (s >> 8);
-  s = (s + (s >> 16)) & 0x3f;
-
-  if (s > 0) {
-    /* normalize divisor and dividend */
-    v.ll <<= s;
-    c1.ll = (c1.ll << s) | (c0.l.high >> (32 - s));
-    c0.ll <<= s;
-  }
-
-  q1.ll = c1.ll / v.l.high;
-  rhat.ll = c1.ll - q1.ll * v.l.high;
-
-  cmp1.l.high = rhat.l.low;
-  cmp1.l.low = c0.l.high;
-  cmp2.ll = q1.ll * v.l.low;
-
-  while (q1.l.high || cmp2.ll > cmp1.ll) {
-    q1.ll--;
-    rhat.ll += v.l.high;
-    if (rhat.l.high)
-      break;
-    cmp1.l.high = rhat.l.low;
-    cmp2.ll -= v.l.low;
-  }
-  c1.l.high = c1.l.low;
-  c1.l.low = c0.l.high;
-  c1.ll -= q1.ll * v.ll;
-  q0.ll = c1.ll / v.l.high;
-  rhat.ll = c1.ll - q0.ll * v.l.high;
-
-  cmp1.l.high = rhat.l.low;
-  cmp1.l.low = c0.l.low;
-  cmp2.ll = q0.ll * v.l.low;
-
-  while (q0.l.high || cmp2.ll > cmp1.ll) {
-    q0.ll--;
-    rhat.ll += v.l.high;
-    if (rhat.l.high)
-      break;
-    cmp1.l.high = rhat.l.low;
-    cmp2.ll -= v.l.low;
-  }
-  q0.l.high += q1.l.low;
-
-  return q0.ll;
-}
-
-static guint64
-gst_util_uint64_scale_int64_unchecked (guint64 val, guint64 num, guint64 denom)
-{
-  GstUInt64 a0, a1, b0, b1, c0, ct, c1, result;
+  GstUInt64 a1, b0;
   GstUInt64 v, n;
 
   /* prepare input */
-  v.ll = val;
-  n.ll = num;
+  v.ll = arg1;
+  n.ll = arg2;
 
   /* do 128 bits multiply
    *                   nh   nl
@@ -291,79 +227,169 @@ gst_util_uint64_scale_int64_unchecked (guint64 val, guint64 num, guint64 denom)
    * b0 =         vh * nl
    * b1 =  + vh * nh
    *       -------------------
-   * c1,c0
+   *        c1h  c1l  c0h  c0l
+   *
+   * "a0" is optimized away, result is stored directly in c0.  "b1" is
+   * optimized away, result is stored directly in c1.
    */
-  a0.ll = (guint64) v.l.low * n.l.low;
+  c0->ll = (guint64) v.l.low * n.l.low;
   a1.ll = (guint64) v.l.low * n.l.high;
   b0.ll = (guint64) v.l.high * n.l.low;
-  b1.ll = (guint64) v.l.high * n.l.high;
 
-  /* and sum together with carry into 128 bits c1, c0 */
-  c0.l.low = a0.l.low;
-  ct.ll = (guint64) a0.l.high + a1.l.low + b0.l.low;
-  c0.l.high = ct.l.low;
-  c1.ll = (guint64) a1.l.high + b0.l.high + ct.l.high + b1.ll;
+  /* add the high word of a0 to the low words of a1 and b0 using c1 as
+   * scrach space to capture the carry.  the low word of the result becomes
+   * the final high word of c0 */
+  c1->ll = (guint64) c0->l.high + a1.l.low + b0.l.low;
+  c0->l.high = c1->l.low;
 
-  /* if high bits bigger than denom, we overflow */
-  if (G_UNLIKELY (c1.ll >= denom))
-    goto overflow;
+  /* add the carry from the result above (found in the high word of c1) and
+   * the high words of a1 and b0 to b1, the result is c1. */
+  c1->ll = (guint64) v.l.high * n.l.high + c1->l.high + a1.l.high + b0.l.high;
+}
 
-  /* shortcut for division by 1, c1.ll should be 0 because of the
-   * overflow check above. */
-  if (denom == 1)
-    return c0.ll;
-
-  /* and 128/64 bits division, result fits 64 bits */
-  if (denom <= G_MAXUINT32) {
-    guint32 den = (guint32) denom;
-
-    /* easy case, (c1,c0)128/(den)32 division */
-    c1.l.high %= den;
-    c1.l.high = c1.ll % den;
-    c1.l.low = c0.l.high;
-    c0.l.high = c1.ll % den;
-    result.l.high = c1.ll / den;
-    result.l.low = c0.ll / den;
-  } else {
-    result.ll = gst_util_div128_64 (c1, c0, denom);
+/* compute the quotient and remainder of 2^64 / d.  returns 0 if the
+ * quotient overflows (meaning d = 1). */
+static guint64
+gst_util_two_to_the_64_over_d (guint64 d, guint64 * remainder)
+{
+  guint64 quotient = G_MAXUINT64 / d;
+  *remainder = G_MAXUINT64 % d + 1;
+  if (*remainder == d) {
+    quotient++;
+    *remainder = 0;
   }
-  return result.ll;
+  return quotient;
+}
 
-overflow:
-  {
+/* divide a 128-bit unsigned int by a 64-bit unsigned int when we know the
+ * quotient fits into 64 bits. */
+static guint64
+gst_util_div128_64 (guint64 c1, guint64 c0, guint64 denom, guint64 * remainder)
+{
+  /* we are trying to compute
+   *
+   * c1 * 2^64 + c0
+   * --------------
+   *       d
+   *
+   * this can be re-written as:
+   *
+   * c1 * 2^64 + c0        2^64   c0
+   * -------------- = c1 * ---- + --
+   *       d                d      d
+   *
+   *                       (            2^64 % d )   c0
+   *                = c1 * (2^64 // d + ---------) + --
+   *                       (                d    )    d
+   *
+   *                                     c1 * (2^64 % d) + c0
+   *                = c1 * (2^64 // d) + --------------------
+   *                                              d
+   *
+   * where "//" indicates the integer quotient and "%" indicates remainder.
+   * note that 2^64 // d != 0 because d fits in 64 bits, and therefore if
+   * c1 != 0 the first term on the right-hand-side is != 0 and therefore
+   * the numerator in the fraction on the right-hand-side must be less than
+   * the numerator in the fraction on the left-hand-side.
+   *
+   * this provides us with an algorithm to compute both the quotient and
+   * remainder iteratively --- essentially a base-2^64 version of long
+   * division.  initializing the quotient to 0, the first term on the
+   * right-hand side is computed and added to the quotient (this can't
+   * overflow because we know the final answer fits in 64 bits).  the
+   * numerator of the second term is then computed and the high and low
+   * words stored in c1 and c0 respectively.  this is repeated until c1 is
+   * 0, at which point the problem has been reduced to computing the
+   * quotient and remainder of a 64-bit unsigned integer (c0) divided by a
+   * 64-bit unsigned integer (denom) which can be completed using regular
+   * integer arithmetic operations.
+   *
+   * note that gst_util_two_to_the_64_over_d() returns 0 if that quotient
+   * overflows.  this can only happen if d = 1, but because we know that
+   * our quotient must fit into 64 bits c1 must be 0 when d = 1, so the
+   * algorithm produces the correct result.
+   */
+
+  guint64 quotient = 0;
+
+  while (c1) {
+    guint64 a;
+    /* add c1 * (2^64 // d) to quotient, store 2^64 % d in a */
+    quotient += c1 * gst_util_two_to_the_64_over_d (denom, &a);
+    /* store the high and low words of c1 * (2^64 % d) in c1 and a
+     * respectively */
+    gst_util_uint64_mul_uint64 ((GstUInt64 *) & c1, (GstUInt64 *) & a, c1, a);
+    /* add a to c0, with a carry into c1 if the result rolls over */
+    if (G_MAXUINT64 - c0 < a)
+      c1++;
+    c0 += a;
+  }
+
+  /* c1 is 0.  use regular integer arithmetic with c0 to complete result */
+  *remainder = c0 % denom;
+  return quotient + c0 / denom;
+}
+
+/* multiply a 64-bit unsigned int by a 32-bit unsigned int into a 96-bit
+ * unsigned int.  the high 64 bits and low 32 bits of the product are
+ * placed in c1 and c0 respectively.  this operation cannot overflow. */
+static void
+gst_util_uint64_mul_uint32 (GstUInt64 * c1, GstUInt64 * c0, guint64 arg1,
+    guint32 arg2)
+{
+  GstUInt64 a;
+
+  a.ll = arg1;
+
+  c0->ll = (guint64) a.l.low * arg2;
+  c1->ll = (guint64) a.l.high * arg2 + c0->l.high;
+  c0->l.high = 0;
+}
+
+/* divide a 96-bit unsigned int by a 32-bit unsigned int when we know the
+ * quotient fits into 64 bits.  the high 64 bits and low 32 bits of the
+ * numerator are expected in c1 and c0 respectively. */
+static guint64
+gst_util_div96_32 (guint64 c1, guint64 c0, guint32 denom, guint32 * remainder)
+{
+  c0 += (c1 % denom) << 32;
+  *remainder = c0 % denom;
+  return ((c1 / denom) << 32) + (c0 / denom);
+}
+
+static guint64
+gst_util_uint64_scale_uint64_unchecked (guint64 val, guint64 num,
+    guint64 denom, guint64 * remainder)
+{
+  guint64 c1, c0;
+
+  /* compute 128-bit numerator product */
+  gst_util_uint64_mul_uint64 ((GstUInt64 *) & c1, (GstUInt64 *) & c0, val, num);
+
+  /* high word as big as or bigger than denom --> overflow */
+  if (G_UNLIKELY (c1 >= denom))
     return G_MAXUINT64;
-  }
+
+  /* compute quotient, fits in 64 bits */
+  return gst_util_div128_64 (c1, c0, denom, remainder);
 }
 
 static inline guint64
-gst_util_uint64_scale_int_unchecked (guint64 val, gint num, gint denom)
+gst_util_uint64_scale_uint32_unchecked (guint64 val, guint32 num,
+    guint32 denom, guint32 * remainder)
 {
-  GstUInt64 result;
-  GstUInt64 low, high;
+  GstUInt64 c1, c0;
 
-  /* do 96 bits mult/div */
-  low.ll = val;
-  result.ll = ((guint64) low.l.low) * num;
-  high.ll = ((guint64) low.l.high) * num + (result.l.high);
+  /* compute 96-bit numerator product */
+  gst_util_uint64_mul_uint32 (&c1, &c0, val, num);
 
-  low.ll = high.ll / denom;
-  result.l.high = high.ll % denom;
-  result.ll /= denom;
-
-  /* avoid overflow */
-  if (G_UNLIKELY (low.ll + result.l.high > G_MAXUINT32))
-    goto overflow;
-
-  result.l.high += low.l.low;
-
-  return result.ll;
-
-overflow:
-  {
+  /* high 32 bits as big as or bigger than denom --> overflow */
+  if (G_UNLIKELY (c1.l.high >= denom))
     return G_MAXUINT64;
-  }
-}
 
+  /* compute quotient, fits in 64 bits */
+  return gst_util_div96_32 (c1.ll, c0.ll, denom, remainder);
+}
 
 /**
  * gst_util_uint64_scale:
@@ -371,16 +397,19 @@ overflow:
  * @num: the numerator of the scale ratio
  * @denom: the denominator of the scale ratio
  *
- * Scale @val by @num / @denom, trying to avoid overflows.
+ * Scale @val by the rational number @num / @denom, avoiding overflows and
+ * underflows and without loss of precision.
  *
- * This function can potentially be very slow if denom > G_MAXUINT32.
+ * This function can potentially be very slow if val and num are both
+ * greater than G_MAXUINT32.
  *
- * Returns: @val * @num / @denom, trying to avoid overflows.
- * In the case of an overflow, this function returns G_MAXUINT64.
+ * Returns: @val * @num / @denom.  In the case of an overflow, this
+ * function returns G_MAXUINT64.
  */
 guint64
 gst_util_uint64_scale (guint64 val, guint64 num, guint64 denom)
 {
+  guint64 remainder;
   g_return_val_if_fail (denom != 0, G_MAXUINT64);
 
   if (G_UNLIKELY (num == 0))
@@ -389,27 +418,22 @@ gst_util_uint64_scale (guint64 val, guint64 num, guint64 denom)
   if (G_UNLIKELY (num == denom))
     return val;
 
-  /* if the denom is high, we need to do a 64 muldiv */
-  if (G_UNLIKELY (denom > G_MAXINT32))
-    goto do_int64;
+  /* deneom is low --> try to use 96 bit muldiv */
+  if (G_LIKELY (denom <= G_MAXUINT32)) {
+    guint32 remainder;
+    /* num is low --> use 96 bit muldiv */
+    if (G_LIKELY (num <= G_MAXUINT32))
+      return gst_util_uint64_scale_uint32_unchecked (val, (guint32) num,
+          (guint32) denom, &remainder);
 
-  /* if num and denom are low we can do a 32 bit muldiv */
-  if (G_LIKELY (num <= G_MAXINT32))
-    goto do_int32;
+    /* num is high but val is low --> swap and use 96-bit muldiv */
+    if (G_LIKELY (val <= G_MAXUINT32))
+      return gst_util_uint64_scale_uint32_unchecked (num, (guint32) val,
+          (guint32) denom, &remainder);
+  }
 
-  /* val and num are high, we need 64 muldiv */
-  if (G_UNLIKELY (val > G_MAXINT32))
-    goto do_int64;
-
-  /* val is low and num is high, we can swap them and do 32 muldiv */
-  return gst_util_uint64_scale_int_unchecked (num, (gint) val, (gint) denom);
-
-do_int32:
-  return gst_util_uint64_scale_int_unchecked (val, (gint) num, (gint) denom);
-
-do_int64:
-  /* to the more heavy implementations... */
-  return gst_util_uint64_scale_int64_unchecked (val, num, denom);
+  /* val is high and num is high --> use 128-bit muldiv */
+  return gst_util_uint64_scale_uint64_unchecked (val, num, denom, &remainder);
 }
 
 /**
@@ -418,17 +442,17 @@ do_int64:
  * @num: numerator of the scale factor.
  * @denom: denominator of the scale factor.
  *
- * Scale a guint64 by a factor expressed as a fraction (num/denom), avoiding
- * overflows and loss of precision.
+ * Scale @val by the rational number @num / @denom, avoiding overflows and
+ * underflows and without loss of precision.  @num must be non-negative and
+ * @denom must be positive.
  *
- * @num and @denom must be positive integers. @denom cannot be 0.
- *
- * Returns: @val * @num / @denom, avoiding overflow and loss of precision.
- * In the case of an overflow, this function returns G_MAXUINT64.
+ * Returns: @val * @num / @denom.  In the case of an overflow, this
+ * function returns G_MAXUINT64.
  */
 guint64
 gst_util_uint64_scale_int (guint64 val, gint num, gint denom)
 {
+  guint32 remainder;
   g_return_val_if_fail (denom > 0, G_MAXUINT64);
   g_return_val_if_fail (num >= 0, G_MAXUINT64);
 
@@ -438,11 +462,16 @@ gst_util_uint64_scale_int (guint64 val, gint num, gint denom)
   if (G_UNLIKELY (num == denom))
     return val;
 
-  if (val <= G_MAXUINT32)
-    /* simple case */
-    return val * num / denom;
+  if (val <= G_MAXUINT32) {
+    /* simple case, use two statements to prevent optimizer from screwing
+     * up result.  num and denom are not negative so casts are OK */
+    val *= (guint64) num;
+    return val / (guint64) denom;
+  }
 
-  return gst_util_uint64_scale_int_unchecked (val, num, denom);
+  /* num and denom are not negative so casts are OK */
+  return gst_util_uint64_scale_uint32_unchecked (val, (guint32) num,
+      (guint32) denom, &remainder);
 }
 
 /**