Decoding binary protocols in python
Decoding binary protocols like the MySQL Client/Server Protocol or MySQL's new X Protocol involves taking a sequence of bytes and turning them into integers.
In python the usual workhorse for this task is struct.unpack()
It takes a sequence of bytes and a format-string and returns a tuple of decoded values.
In the case of the MySQL Client/Server protocol the integers are (mostly) little-endian, unsigned and we can use:
format | description |
---|---|
<B | integer, little endian, unsigned, 1 byte |
<H | integer, little endian, unsigned, 2 bytes |
<L | integer, little endian, unsigned, 4 bytes |
<Q | integer, little endian, unsigned, 8 bytes |
# unpack_int_le_1
import struct

# Byte-lengths that have a native struct format: little-endian ("<"),
# unsigned (B/H/L/Q).
_NATIVE_FORMATS = {1: "<B", 2: "<H", 4: "<L", 8: "<Q"}


def unpack_int_le_struct(payload, l):
    """Decode the first *l* bytes of *payload* as a little-endian
    unsigned integer.

    :param payload: bytes-like object, at least *l* bytes long
    :param l: number of bytes to decode (0 yields 0)
    :return: the decoded non-negative integer
    :raises struct.error: if *payload* is shorter than *l* bytes
    """
    if l == 0:
        return 0
    fmt = _NATIVE_FORMATS.get(l)
    if fmt is not None:
        return struct.unpack_from(fmt, payload)[0]
    # no native mapping for that byte-length,
    # fallback to shift+or
    v = 0
    sh = 0
    for d in struct.unpack_from("%dB" % l, payload):
        v |= d << sh
        sh += 8
    return v


# benchmark.py imports the function under this name; keep both names
# available so the module and the benchmark agree.
unpack_int_le = unpack_int_le_struct
The code gets benchmarked with 'timeit' which runs the setup once and the test-statement 1,000,000 times (timeit's default `number`):
# benchmark.py
"""Time unpack_int_le() for several byte-lengths and print an rst table."""
import timeit
from collections import OrderedDict

import tabulate

# Executed once per timeit() call; the decoded buffer is all zeros so
# only the decoding path itself is measured.
setup = r"""
from unpack_int_le_1 import unpack_int_le
test_str = b'\x00\x00\x00\x00\x00\x00\x00\x00'
"""

timings = OrderedDict()
for n in (1, 2, 3, 4, 8):
    key = str(n)
    timings[key] = timeit.timeit(
        r"unpack_int_le(test_str, %d)" % (n, ),
        setup=setup)

print(tabulate.tabulate(
    timings.items(),
    tablefmt="rst",
    # header matches the "time (s)" column name used in the article's
    # result tables (was just "(s)")
    headers=("byte length",
             "time (s)",
             )))
shows us:
byte length | time (s) |
---|---|
1 | 0.466556 |
2 | 0.422064 |
3 | 1.17804 |
4 | 0.439113 |
8 | 0.448069 |
As the MySQL Client/Server protocol also has a 3-byte unsigned integer which has no mapping in struct's format strings, the fallback is taken, which is quite slow.
Optimizing 3 byte integers
There is a nice trick to speed it up:
uint3_le = struct.unpack("<L", payload[:3] + b"\x00")[0]
# unpack_int_le_2.py
import struct


def unpack_int_le(payload, l):
    """Decode the first *l* bytes of *payload* as a little-endian
    unsigned integer.

    Byte-lengths with a native struct format (1, 2, 4, 8) are decoded
    directly; 3 bytes are zero-padded to 4 when the payload supports
    concatenation; anything else falls back to a shift-or loop.
    """
    if l == 0:
        return 0
    if l == 1:
        return struct.unpack_from("<B", payload)[0]
    if l == 2:
        return struct.unpack_from("<H", payload)[0]
    if l == 3 and hasattr(payload, "__add__"):
        # zero-pad to 4 bytes and reuse the native 4-byte decode;
        # memoryviews have no __add__ and take the generic path below
        return struct.unpack_from("<L", payload[:l] + b"\x00")[0]
    if l == 4:
        return struct.unpack_from("<L", payload)[0]
    if l == 8:
        return struct.unpack_from("<Q", payload)[0]
    # generic path: fold the bytes most-significant-first so each step
    # is a shift and an or, equivalent to the little-endian sum
    value = 0
    for octet in reversed(struct.unpack_from("%dB" % l, payload)):
        value = (value << 8) | octet
    return value
Note
the check for __add__ handles the case of memoryviews, which can't be extended like that.
byte length | time (s) |
---|---|
1 | 0.479504 |
2 | 0.430233 |
3 | 0.650036 |
4 | 0.456068 |
8 | 0.481057 |
The tiny bit of extra work to slice and extend the string, wins over doing all the work in pure python.
Naive cython
cython is a python to C converter which (with the help of a C compiler) builds native python modules.
You can install it via pip:
$ pip install cython
and generate a native python module with:
$ cp unpack_int_le_2.py unpack_int_le_3.py && \
 cythonize ./unpack_int_le_3.py && \
 gcc -shared -pthread -fPIC -fwrapv -O3 -Wall \
 -fno-strict-aliasing -I/usr/include/python2.7 \
 -o unpack_int_le_3.so unpack_int_le_3.c
Without any changes to the python code we gain between 20-40%:
byte length | time (s) |
---|---|
1 | 0.36877 |
2 | 0.314847 |
3 | 0.427502 |
4 | 0.322987 |
8 | 0.334282 |
Static typed python with annotations
cython can do better though with static typing.
Looking at the generated unpack_int_le_3.c one can see that most of the time is spent in the abstractions (object, buffer-interface, ...) for each byte extracted.
cython can remove all that if we give it some hints:
# unpack_int_le_4.py
# Static-type hints for cython's "pure Python" mode: the @cython.locals
# decorators declare C types that take effect only when the module is
# compiled with cython; they tell cython to treat `payload` as a raw
# unsigned char pointer and the counters as C unsigned longs, removing
# the per-byte Python-object overhead of the naive build.
import cython


@cython.locals(payload=cython.p_uchar,
               l=cython.ulong,
               v=cython.ulong,
               sh=cython.ulong,
               n=cython.ulong)
def _unpack_int_le(payload, l):
    # Little-endian shift-or accumulation; with the typed locals above,
    # cython compiles this loop down to plain C integer arithmetic.
    v = sh = 0
    for n in range(l):
        v |= payload[n] << sh
        sh += 8
    return v


@cython.locals(l=cython.ulong,
               charp=cython.p_uchar)
def unpack_int_le(payload, l):
    # Bounds-check before a raw pointer into the buffer is handed to the
    # helper — the compiled helper itself does no bounds checking.
    if l > len(payload):
        raise IndexError()
    if isinstance(payload, (bytes, bytearray)):
        # Assigning to the p_uchar-typed local makes cython extract the
        # underlying char buffer from the bytes object.
        charp = payload
        return _unpack_int_le(charp, l)
    else:
        raise TypeError(type(payload))
See also: http://docs.cython.org/en/latest/src/tutorial/pure.html
With these hints given the _unpack_int_le() from above gets translated to:
__pyx_v_v = 0;
__pyx_v_sh = 0;
__pyx_t_1 = __pyx_v_l;
for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
__pyx_v_n = __pyx_t_2;
__pyx_v_v = (__pyx_v_v | ((__pyx_v_payload[__pyx_v_n]) << __pyx_v_sh));
__pyx_v_sh = (__pyx_v_sh + 8);
}
return PyInt_from_unsigned_long(__pyx_v_v)
This gains another 50%:
byte length | time (s) |
---|---|
1 | 0.224607 |
2 | 0.157248 |
3 | 0.158877 |
4 | 0.163004 |
8 | 0.165769 |
While the code is pure python with some annotations it sadly comes with some overhead in the generated C-code.
Static typed cython
cython has two more ways of specifying the static types:
- move the static types into a .pxd file
- write a .pyx file directly
In our case the result would be the same which is why I go with the first approach:
# unpack_int_le_5.pxd
# Declaration file: cython merges these static C signatures with the
# matching definitions in unpack_int_le_5.py at compile time, so the
# .py file stays importable as plain Python while the compiled module
# gets fully typed C functions.
import cython


# cdef: C-only helper, not callable from Python; locals are typed so the
# shift-or loop compiles to plain C integer arithmetic.
@cython.locals(v=cython.ulong,
               sh=cython.ulong,
               n=cython.ulong)
cdef unsigned long _unpack_int_le(unsigned char *payload, unsigned long l)


# cpdef: callable from both C and Python; charp is the raw pointer
# extracted from the bytes/bytearray payload in the .py body.
@cython.locals(l=cython.ulong,
               charp=cython.p_uchar)
cpdef unpack_int_le(payload, unsigned long l)
# unpack_int_le_5.py
# Implementation half of the pair: unpack_int_le_5.pxd supplies the
# static C types for these names at cython compile time, so the local
# variable names here must match the .pxd declarations exactly.
def _unpack_int_le(payload, l):
    # Little-endian shift-or accumulation over *l* bytes; compiled, this
    # operates on C unsigned longs and a raw char pointer (see .pxd).
    v = sh = n = 0
    for n in range(l):
        v |= payload[n] << sh
        sh += 8
    return v


def unpack_int_le(payload, l):
    # Guard against reading past the end of the buffer before a raw
    # pointer is handed to the (unchecked) helper.
    if l > len(payload):
        raise IndexError()
    if isinstance(payload, (bytes, bytearray)):
        # The .pxd types charp as unsigned char*, so this assignment
        # extracts the underlying buffer when compiled.
        charp = payload
        return _unpack_int_le(charp, l)
    else:
        raise TypeError(type(payload))
$ cythonize ./unpack_int_le_5.py && \
gcc -shared -pthread -fPIC -fwrapv \
-O3 -Wall -fno-strict-aliasing \
-I/usr/include/python2.7 \
-o unpack_int_le_5.so unpack_int_le_5.c
byte length | time (s) |
---|---|
1 | 0.084414 |
2 | 0.0818369 |
3 | 0.083369 |
4 | 0.084229 |
8 | 0.0980709 |
One can see that the longer byte-length leads to slightly higher runtime.
Just cast it
One last step and we are at the final form of the optimizations which mirror what one would have written if one would have written the code in C directly.
If we are running on a little endian platform, we can save shift-or-loop and just cast the byte-sequence into the right integer directly.
Note
to show how the code looks in a pyx file, the previous py and pxd files have been merged into a pyx.
# unpack_int_le_6.pyx
import sys

from libc.stdint cimport uint16_t, uint32_t, uint64_t

# True when the host is little-endian: only then can the on-the-wire
# bytes be reinterpreted as an integer by a plain pointer cast.
cdef unsigned int is_le
is_le = sys.byteorder == "little"


cdef unsigned long _unpack_int_le(const unsigned char *payload, unsigned long l):
    # Fixed-width pointer types instead of the platform-dependent
    # unsigned long/short: the original cast the 4-byte case to
    # `unsigned long *`, which is 8 bytes on LP64 platforms (64-bit
    # Linux/macOS) and therefore read 4 bytes past the field and
    # returned a wrong value.  uint32_t is guaranteed to be 4 bytes.
    cdef const uint16_t *u16p
    cdef const uint32_t *u32p
    cdef const uint64_t *u64p
    cdef unsigned long v = 0
    cdef unsigned long sh = 0
    cdef unsigned long n = 0
    if is_le:
        # NOTE(review): these casts assume unaligned loads are allowed
        # (true on x86/x86-64) — confirm before porting to stricter
        # architectures.
        if l == 1:
            return payload[0]
        elif l == 2:
            u16p = <const uint16_t *>payload
            return u16p[0]
        elif l == 4:
            u32p = <const uint32_t *>payload
            return u32p[0]
        elif l == 8:
            # NOTE(review): the unsigned long return type truncates the
            # 8-byte value on platforms where long is 4 bytes (e.g.
            # 64-bit Windows) — confirm target platforms.
            u64p = <const uint64_t *>payload
            return <unsigned long>u64p[0]
    # Fallback for big-endian hosts and odd lengths (e.g. 3 bytes):
    # little-endian shift-or accumulation.
    v = sh = n = 0
    for n in range(l):
        v |= payload[n] << sh
        sh += 8
    return v


cpdef unsigned long unpack_int_le(payload, unsigned long l):
    # Public entry point: bounds-check, then hand the raw char buffer of
    # a bytes/bytearray payload to the unchecked C helper.
    cdef unsigned char *charp
    if l > len(payload):
        raise IndexError()
    if isinstance(payload, (bytes, bytearray)):
        charp = payload
        return _unpack_int_le(charp, l)
    else:
        raise TypeError(type(payload))
$ cythonize ./unpack_int_le_6.pyx && \
gcc -shared -pthread -fPIC -fwrapv \
-O3 -Wall -fno-strict-aliasing \
-I/usr/include/python2.7 \
-o unpack_int_le_6.so unpack_int_le_6.c
byte length | time (s) |
---|---|
1 | 0.0812111 |
2 | 0.0792191 |
3 | 0.082288 |
4 | 0.079355 |
8 | 0.0791218 |
Note
the 3-byte case hits the loop again and is slightly slower.
Conclusion
Between the first attempt with struct.unpack() and our last with cython we have quite a speed difference:
byte length | time (s) | time (s) | speedup |
---|---|---|---|
1 | 0.466556 | 0.0812111 | 5.75x |
2 | 0.422064 | 0.0792191 | 5.34x |
3 | 1.17804 | 0.082288 | 14.36x |
4 | 0.439113 | 0.079355 | 5.56x |
8 | 0.448069 | 0.0791218 | 5.67x |
We got a 5x speed up (14x for the 3-byte case) and didn't have to write a single line of python C-API code.
Comments