Speeding up protocol decoders in python

Decoding binary protocols in python

Decoding binary protocols like the MySQL Client/Server Protocol or MySQL's new X Protocol involves taking a sequence of bytes and turning them into integers.

In python the usually workhorse for this task is struct.unpack()

It takes a sequence of bytes and a format-string and returns a tuple of decoded values.

In the case of the MySQL Client/Server protocol the integers are (mostly) little-endian, unsigned and we can use:

format	description
<B	integer, little endian, unsigned, 1 bytes
<H	integer, little endian, unsigned, 2 bytes
<L	integer, little endian, unsigned, 4 bytes
<Q	integer, little endian, unsigned, 8 bytes

# unpack_int_le_1

import struct


def unpack_int_le_struct(payload, l):
    if l == 0:
        return 0
    elif l == 1:
        return struct.unpack_from("<B", payload)[0]
    elif l == 2:
        return struct.unpack_from("<H", payload)[0]
    elif l == 4:
        return struct.unpack_from("<L", payload)[0]
    elif l == 8:
        return struct.unpack_from("<Q", payload)[0]
    else:
        # no native mapping for that byte-length,
        # fallback to shift+or
        v = 0
        sh = 0

        for d in struct.unpack_from("%dB" % l, payload):
            v |= d << sh
            sh += 8

        return v

The code gets benchmarked with 'timeit' which runs the setup once and the test-function 10000 times:

# benchmark.py

import timeit
import tabulate

from collections import OrderedDict

setup = r"""
from unpack_int_le_1 import unpack_int_le

test_str = b'\x00\x00\x00\x00\x00\x00\x00\x00'
"""


timings = OrderedDict()

for n in (1, 2, 3, 4, 8):
    key = str(n)

    timings[key] = timeit.timeit(
        r"unpack_int_le(test_str, %d)" % (n, ),
        setup=setup)


print(tabulate.tabulate(
    timings.items(),
    tablefmt="rst",
    headers=("byte length",
             "(s)",
             )))

shows us:

byte length	time (s)
1	0.466556
2	0.422064
3	1.17804
4	0.439113
8	0.448069

As the MySQL Client/Server protocol also has a 3-byte unsigned integer which doesn't have mapping struct's format strings, the fallback is taken which is quite slow.

Optimizing 3 byte integers

There is a nice trick to speed it up:

uint3_le = struct.unpack("<L", payload[:3] + b"\x00")[0]

# unpack_int_le_2.py

import struct


def unpack_int_le(payload, l):
    if l == 0:
        return 0
    elif l == 1:
        return struct.unpack_from("<B", payload)[0]
    elif l == 2:
        return struct.unpack_from("<H", payload)[0]
    elif l == 3 and hasattr(payload, "__add__"):
        return struct.unpack_from("<L", payload[:l] + b"\x00")[0]
    elif l == 4:
        return struct.unpack_from("<L", payload)[0]
    elif l == 8:
        return struct.unpack_from("<Q", payload)[0]
    else:
        # no native mapping for that byte-length, fallback to
        # shift+or
        v = 0
        sh = 0

        for d in struct.unpack_from("%dB" % l, payload):
            v |= d << sh
            sh += 8

        return v

Note

the check for __add__ handles that case of memoryviews which can't be extended like that.

byte length	time (s)
1	0.479504
2	0.430233
3	0.650036
4	0.456068
8	0.481057

The tiny bit of extra work to slice and extend the string, wins over doing all the work in pure python.

Naive cython

cython is a python to C converter which (with the help of a C compiler) builds native python modules.

You can install it via pip:

$ pip install cython

and generate a native python module with:

$ cythonize ./unpack_int_le_2.py && \
  gcc -shared -pthread -fPIC -fwrapv -O3 -Wall \
    -fno-strict-aliasing -I/usr/include/python2.7 \
    -o unpack_int_le_3.so unpack_int_le_3.c

Without any changes to the python code we gain between 20-40%:

byte length	time (s)
1	0.36877
2	0.314847
3	0.427502
4	0.322987
8	0.334282

Static typed python with annotations

cython can do better though with static typing.

Looking at the generated unpack_int_le_3.c one can see that most of the time is spent in the abstractions (object, buffer-interface, ...) for each byte extracted.

cython can remove all that if we give it some hints:

# unpack_int_le_4.py

import cython


@cython.locals(payload=cython.p_uchar,
               l=cython.ulong,
               v=cython.ulong,
               sh=cython.ulong,
               n=cython.ulong)
def _unpack_int_le(payload, l):
    v = sh = 0

    for n in range(l):
        v |= payload[n] << sh
        sh += 8

    return v


@cython.locals(l=cython.ulong,
               charp=cython.p_uchar)
def unpack_int_le(payload, l):
    if l > len(payload):
        raise IndexError()

    if isinstance(payload, (bytes, bytearray)):
        charp = payload

        return _unpack_int_le(charp, l)
    else:
        raise TypeError(type(payload))

With these hints given the _unpack_int_le() from above gets translated to:

__pyx_v_v = 0;
__pyx_v_sh = 0;

__pyx_t_1 = __pyx_v_l;
for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
  __pyx_v_n = __pyx_t_2;

  __pyx_v_v = (__pyx_v_v | ((__pyx_v_payload[__pyx_v_n]) << __pyx_v_sh));
  __pyx_v_sh = (__pyx_v_sh + 8);
}

return PyInt_from_unsigned_long(__pyx_v_v)

This gains another 50%:

byte length	time (s)
1	0.224607
2	0.157248
3	0.158877
4	0.163004
8	0.165769

While the code is pure python with some annotations it sadly comes with some overhead in the generated C-code.

Static typed cython

cython has two more ways of specifying the static types:

move the static types into a .pxd file
write a .pyx file directly

In our case the result would be the same which is why I go with the first approach:

# unpack_int_le_5.pxd

import cython


@cython.locals(v=cython.ulong,
               sh=cython.ulong,
               n=cython.ulong)
cdef unsigned long _unpack_int_le(unsigned char *payload, unsigned long l)

@cython.locals(l=cython.ulong,
               charp=cython.p_uchar)
cpdef unpack_int_le(payload, unsigned long l)

# unpack_int_le_5.py

def _unpack_int_le(payload, l):
    v = sh = n = 0

    for n in range(l):
        v |= payload[n] << sh
        sh += 8

    return v


def unpack_int_le(payload, l):
    if l > len(payload):
        raise IndexError()

    if isinstance(payload, (bytes, bytearray)):
        charp = payload

        return _unpack_int_le(charp, l)
    else:
        raise TypeError(type(payload))

$ cythonize ./unpack_int_le_5.py && \
  gcc -shared -pthread -fPIC -fwrapv \
    -O3 -Wall -fno-strict-aliasing \
    -I/usr/include/python2.7 \
    -o unpack_int_le_5.so unpack_int_le_5.c

byte length	time (s)
1	0.084414
2	0.0818369
3	0.083369
4	0.084229
8	0.0980709

One can see that the longer byte-length leads to slightly higher runtime.

Just cast it

One last step and we are at the final form of the optimizations which mirror what one would have written if one would have written the code in C directly.

If we are running on a little endian platform, we can save shift-or-loop and just cast the byte-sequence into the right integer directly.

Note

to show how the code looks in a pyx file, the previous py and pxd files have been merged into a pyx.

# unpack_int_le_6.pyx

import sys


cdef unsigned int is_le
is_le = sys.byteorder == "little"


cdef unsigned long _unpack_int_le(const unsigned char *payload, unsigned long l):
  cdef unsigned long *u32p
  cdef unsigned short *u16p
  cdef unsigned long long *u64p
  cdef unsigned long v = 0
  cdef unsigned long sh = 0
  cdef unsigned long n = 0

  if is_le:
      if l == 1:
          return payload[0]
      elif l == 2:
          u16p = <unsigned short *>payload
          return u16p[0]
      elif l == 4:
          u32p = <unsigned long *>payload
          return u32p[0]
      elif l == 8:
          u64p = <unsigned long long *>payload
          return u64p[0]

  v = sh = n = 0

  for n in range(l):
      v |= payload[n] << sh
      sh += 8

  return v


cpdef unsigned long unpack_int_le(payload, unsigned long l):
    cdef unsigned char *charp
    if l > len(payload):
        raise IndexError()

    if isinstance(payload, (bytes, bytearray)):
        charp = payload

        return _unpack_int_le(charp, l)
    else:
        raise TypeError(type(payload))

$ cythonize ./unpack_int_le_6.pyx && \
  gcc -shared -pthread -fPIC -fwrapv \
    -O3 -Wall -fno-strict-aliasing \
    -I/usr/include/python2.7 \
    -o unpack_int_le_6.so unpack_int_le_6.c

byte length	time (s)
1	0.0812111
2	0.0792191
3	0.082288
4	0.079355
8	0.0791218

Note

the 3-byte case hits the loop again and is slightly slower.

Conclusion

Between the first attempt with struct.unpack() and our last with cython we have quite a speed difference:

byte length	time (s)	time (s)	speedup
1	0.466556	0.0812111	5.75x
2	0.422064	0.0792191	5.34x
3	1.17804	0.082288	14.36x
4	0.439113	0.079355	5.56x
8	0.448069	0.0791218	5.67x

We got a 5x speed up and didn't had to write a single line of python C-API code.

Decoding binary protocols in python

Optimizing 3 byte integers

Naive cython

Static typed python with annotations

Static typed cython

Just cast it

Conclusion

Comments