bitshuffle-0.3.5/bitshuffle/tests/make_regression_tdata.py
"""
Script to create data used for regression testing.
"""
import numpy as np
from numpy import random
import h5py
import bitshuffle
from bitshuffle import h5
BLOCK_SIZE = 64  # Small, so the datasets span many blocks while staying small.
FILTER_PIPELINE = [h5.H5FILTER,]
FILTER_OPTS = [(BLOCK_SIZE, h5.H5_COMPRESS_LZ4)]
OUT_FILE = "bitshuffle/tests/data/regression_%s.h5" % bitshuffle.__version__
DTYPES = ['a1', 'a2', 'a3', 'a4', 'a6', 'a8', 'a10']
f = h5py.File(OUT_FILE, 'w')
g_comp = f.create_group("compressed")
g_orig = f.create_group("origional")  # [sic] name kept; test_regression.py reads it back.
for dtype in DTYPES:
    # Keep `dtype` as the string code: it is used in the dataset names, and
    # rebinding it to an np.dtype would produce names like "|S1_b" instead of
    # "a1_b" on the later reps.
    np_dtype = np.dtype(dtype)
    for rep in ['a', 'b', 'c']:
        dset_name = "%s_%s" % (dtype, rep)
        n_elem = 3 * BLOCK_SIZE + random.randint(0, BLOCK_SIZE)
        shape = (n_elem,)
        chunks = shape
        data = random.randint(0, 255, n_elem * np_dtype.itemsize)
        data = data.astype(np.uint8).view(np_dtype)
        g_orig.create_dataset(dset_name, data=data)
        h5.create_dataset(g_comp, dset_name, shape, np_dtype, chunks=chunks,
                          filter_pipeline=FILTER_PIPELINE,
                          filter_opts=FILTER_OPTS)
        g_comp[dset_name][:] = data
f.close()
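
# Sanity check (an illustrative addition, not part of the original script):
# read the file back and confirm the compressed copies match the originals,
# mirroring what bitshuffle/tests/test_regression.py does.  Importing
# `bitshuffle.h5` above already registered the filter, so the compressed
# group is readable in-process.
f = h5py.File(OUT_FILE, 'r')
for dset_name in f["compressed"]:
    assert np.all(f["compressed"][dset_name][:] == f["origional"][dset_name][:])
f.close()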

bitshuffle-0.3.5/bitshuffle/tests/test_ext.py
from __future__ import absolute_import, division, print_function, unicode_literals
import unittest
import time
import timeit
import numpy as np
from numpy import random
from bitshuffle import ext
# If we are doing timings, the factor by which to increase the workload.
# Remember to change `ext.REPEATC` to match.
TIME = 0
#TIME = 8    # With TIME = 8 the buffer is 8 kB, the same as the final blocking.
BLOCK = 1024
TEST_DTYPES = [np.uint8, np.uint16, np.int32, np.uint64, np.float32,
np.float64, np.complex128]
TEST_DTYPES += [b'a3', b'a5', b'a6', b'a7', b'a9', b'a11', b'a12', b'a24',
b'a48']
class TestProfile(unittest.TestCase):
def setUp(self):
n = 1024 # bytes.
if TIME:
n *= TIME
        # Almost random bits, but not quite: all bits are exercised (to fully
        # test the transpose) while the data stays slightly compressible.
self.data = random.randint(0, 200, n).astype(np.uint8)
self.fun = ext.copy
self.check = None
self.check_data = None
self.case = "None"
def tearDown(self):
"""Performs all tests and timings."""
if TIME:
reps = 10
else:
reps = 1
delta_ts = []
try:
for ii in range(reps):
t0 = time.time()
out = self.fun(self.data)
delta_ts.append(time.time() - t0)
except RuntimeError as err:
if (len(err.args) > 1 and (err.args[1] == -11)
and not ext.using_SSE2()):
return
if (len(err.args) > 1 and (err.args[1] == -12)
and not ext.using_AVX2()):
return
else:
raise
delta_t = min(delta_ts)
size_i = self.data.size * self.data.dtype.itemsize
size_o = out.size * out.dtype.itemsize
size = max([size_i, size_o])
speed = (ext.REPEAT * size / delta_t / 1024**3) # GB/s
if TIME:
print("%-20s: %5.2f s/GB, %5.2f GB/s" % (self.case, 1./speed, speed))
        if self.check is not None:
ans = self.check(self.data).view(np.uint8)
self.assertTrue(np.all(ans == out.view(np.uint8)))
        if self.check_data is not None:
ans = self.check_data.view(np.uint8)
self.assertTrue(np.all(ans == out.view(np.uint8)))
def test_00_copy(self):
self.case = "copy"
self.fun = ext.copy
self.check = lambda x: x
def test_01a_trans_byte_elem_scal_16(self):
self.case = "byte T elem scal 16"
self.data = self.data.view(np.int16)
self.fun = ext.trans_byte_elem_scal
self.check = trans_byte_elem
def test_01b_trans_byte_elem_scal_32(self):
self.case = "byte T elem scal 32"
self.data = self.data.view(np.int32)
self.fun = ext.trans_byte_elem_scal
self.check = trans_byte_elem
def test_01c_trans_byte_elem_scal_64(self):
self.case = "byte T elem scal 64"
self.data = self.data.view(np.int64)
self.fun = ext.trans_byte_elem_scal
self.check = trans_byte_elem
def test_01d_trans_byte_elem_16(self):
self.case = "byte T elem SSE 16"
self.data = self.data.view(np.int16)
self.fun = ext.trans_byte_elem_SSE
self.check = trans_byte_elem
def test_01e_trans_byte_elem_32(self):
self.case = "byte T elem SSE 32"
self.data = self.data.view(np.float32)
self.fun = ext.trans_byte_elem_SSE
self.check = trans_byte_elem
def test_01f_trans_byte_elem_64(self):
self.case = "byte T elem SSE 64"
self.data = self.data.view(np.float64)
self.fun = ext.trans_byte_elem_SSE
self.check = trans_byte_elem
def test_01g_trans_byte_elem_128(self):
self.case = "byte T elem SSE 128"
self.data = self.data.view(np.complex128)
self.fun = ext.trans_byte_elem_SSE
self.check = trans_byte_elem
def test_01h_trans_byte_elem_96(self):
self.case = "byte T elem SSE 96"
n = self.data.size // 128 * 96
dt = np.dtype([(str('a'), np.int32), (str('b'), np.int32),
(str('c'), np.int32)])
self.data = self.data[:n].view(dt)
self.fun = ext.trans_byte_elem_SSE
self.check = trans_byte_elem
def test_01i_trans_byte_elem_80(self):
self.case = "byte T elem SSE 80"
n = self.data.size // 128 * 80
dt = np.dtype([(str('a'), np.int16), (str('b'), np.int16),
(str('c'), np.int16), (str('d'), np.int16),
(str('e'), np.int16)])
self.data = self.data[:n].view(dt)
self.fun = ext.trans_byte_elem_SSE
self.check = trans_byte_elem
def test_03a_trans_bit_byte(self):
self.case = "bit T byte scal 64"
self.data = self.data.view(np.float64)
self.fun = ext.trans_bit_byte_scal
self.check = trans_bit_byte
def test_03d_trans_bit_byte_SSE(self):
self.case = "bit T byte SSE 64"
self.data = self.data.view(np.float64)
self.fun = ext.trans_bit_byte_SSE
self.check = trans_bit_byte
def test_03f_trans_bit_byte_AVX(self):
self.case = "bit T byte AVX 64"
self.data = self.data.view(np.float64)
self.fun = ext.trans_bit_byte_AVX
self.check = trans_bit_byte
def test_03g_trans_bit_byte_AVX_32(self):
self.case = "bit T byte AVX 32"
self.data = self.data.view(np.float32)
self.fun = ext.trans_bit_byte_AVX
self.check = trans_bit_byte
def test_04a_trans_bit_elem_AVX(self):
self.case = "bit T elem AVX 64"
self.data = self.data.view(np.float64)
self.fun = ext.trans_bit_elem_AVX
self.check = trans_bit_elem
def test_04b_trans_bit_elem_AVX_128(self):
self.case = "bit T elem AVX 128"
self.data = self.data.view(np.complex128)
self.fun = ext.trans_bit_elem_AVX
self.check = trans_bit_elem
def test_04c_trans_bit_elem_AVX_32(self):
self.case = "bit T elem AVX 32"
self.data = self.data.view(np.float32)
self.fun = ext.trans_bit_elem_AVX
self.check = trans_bit_elem
def test_04d_trans_bit_elem_AVX_16(self):
self.case = "bit T elem AVX 16"
self.data = self.data.view(np.int16)
self.fun = ext.trans_bit_elem_AVX
self.check = trans_bit_elem
def test_04e_trans_bit_elem_64(self):
self.case = "bit T elem scal 64"
self.data = self.data.view(np.float64)
self.fun = ext.trans_bit_elem_scal
self.check = trans_bit_elem
def test_04f_trans_bit_elem_SSE_32(self):
self.case = "bit T elem SSE 32"
self.data = self.data.view(np.float32)
self.fun = ext.trans_bit_elem_SSE
self.check = trans_bit_elem
def test_04g_trans_bit_elem_SSE_64(self):
self.case = "bit T elem SSE 64"
self.data = self.data.view(np.float64)
self.fun = ext.trans_bit_elem_SSE
self.check = trans_bit_elem
def test_06a_untrans_bit_elem_16(self):
self.case = "bit U elem SSE 16"
pre_trans = self.data.view(np.int16)
self.data = trans_bit_elem(pre_trans)
self.fun = ext.untrans_bit_elem_SSE
self.check_data = pre_trans
def test_06b_untrans_bit_elem_128(self):
self.case = "bit U elem SSE 128"
pre_trans = self.data.view(np.complex128)
self.data = trans_bit_elem(pre_trans)
self.fun = ext.untrans_bit_elem_SSE
self.check_data = pre_trans
def test_06c_untrans_bit_elem_32(self):
self.case = "bit U elem SSE 32"
pre_trans = self.data.view(np.float32)
self.data = trans_bit_elem(pre_trans)
self.fun = ext.untrans_bit_elem_SSE
self.check_data = pre_trans
def test_06d_untrans_bit_elem_32(self):
self.case = "bit U elem AVX 32"
pre_trans = self.data.view(np.float32)
self.data = trans_bit_elem(pre_trans)
self.fun = ext.untrans_bit_elem_AVX
self.check_data = pre_trans
def test_06e_untrans_bit_elem_64(self):
self.case = "bit U elem SSE 64"
pre_trans = self.data.view(np.float64)
self.data = trans_bit_elem(pre_trans)
self.fun = ext.untrans_bit_elem_SSE
self.check_data = pre_trans
def test_06f_untrans_bit_elem_64(self):
self.case = "bit U elem AVX 64"
pre_trans = self.data.view(np.float64)
self.data = trans_bit_elem(pre_trans)
self.fun = ext.untrans_bit_elem_AVX
self.check_data = pre_trans
def test_06g_untrans_bit_elem_64(self):
self.case = "bit U elem scal 64"
pre_trans = self.data.view(np.float64)
self.data = trans_bit_elem(pre_trans)
self.fun = ext.untrans_bit_elem_scal
self.check_data = pre_trans
def test_07a_trans_byte_bitrow_64(self):
self.case = "byte T row scal 64"
self.data = self.data.view(np.float64)
self.fun = ext.trans_byte_bitrow_scal
def test_07b_trans_byte_bitrow_SSE_64(self):
self.case = "byte T row SSE 64"
self.data = self.data.view(np.float64)
self.fun = ext.trans_byte_bitrow_SSE
self.check = ext.trans_byte_bitrow_scal
def test_07c_trans_byte_bitrow_AVX_64(self):
self.case = "byte T row AVX 64"
self.data = self.data.view(np.float64)
self.fun = ext.trans_byte_bitrow_AVX
self.check = ext.trans_byte_bitrow_scal
def test_08a_shuffle_bit_eight_scal_64(self):
self.case = "bit S eight scal 64"
self.data = self.data.view(np.float64)
self.fun = ext.shuffle_bit_eightelem_scal
def test_08b_shuffle_bit_eight_SSE_64(self):
self.case = "bit S eight SSE 64"
self.data = self.data.view(np.float64)
self.fun = ext.shuffle_bit_eightelem_SSE
self.check = ext.shuffle_bit_eightelem_scal
def test_08c_shuffle_bit_eight_AVX_32(self):
self.case = "bit S eight AVX 32"
self.data = self.data.view(np.float32)
self.fun = ext.shuffle_bit_eightelem_AVX
self.check = ext.shuffle_bit_eightelem_scal
def test_08d_shuffle_bit_eight_AVX_64(self):
self.case = "bit S eight AVX 64"
self.data = self.data.view(np.float64)
self.fun = ext.shuffle_bit_eightelem_AVX
self.check = ext.shuffle_bit_eightelem_scal
def test_08e_shuffle_bit_eight_AVX_16(self):
self.case = "bit S eight AVX 16"
self.data = self.data.view(np.int16)
self.fun = ext.shuffle_bit_eightelem_AVX
self.check = ext.shuffle_bit_eightelem_scal
def test_08f_shuffle_bit_eight_AVX_128(self):
self.case = "bit S eight AVX 128"
self.data = self.data.view(np.complex128)
self.fun = ext.shuffle_bit_eightelem_AVX
self.check = ext.shuffle_bit_eightelem_scal
def test_09a_trans_bit_elem_scal_64(self):
self.case = "bit T elem scal 64"
self.data = self.data.view(np.float64)
self.fun = ext.trans_bit_elem_scal
self.check = trans_bit_elem
def test_09b_trans_bit_elem_SSE_64(self):
self.case = "bit T elem SSE 64"
self.data = self.data.view(np.float64)
self.fun = ext.trans_bit_elem_SSE
self.check = trans_bit_elem
def test_09c_trans_bit_elem_AVX_64(self):
self.case = "bit T elem AVX 64"
self.data = self.data.view(np.float64)
self.fun = ext.trans_bit_elem_AVX
self.check = trans_bit_elem
def test_09d_untrans_bit_elem_scal_64(self):
self.case = "bit U elem scal 64"
pre_trans = self.data.view(np.float64)
self.data = trans_bit_elem(pre_trans)
self.fun = ext.untrans_bit_elem_scal
self.check_data = pre_trans
def test_09e_untrans_bit_elem_SSE_64(self):
self.case = "bit U elem SSE 64"
pre_trans = self.data.view(np.float64)
self.data = trans_bit_elem(pre_trans)
self.fun = ext.untrans_bit_elem_SSE
self.check_data = pre_trans
def test_09f_untrans_bit_elem_AVX_64(self):
self.case = "bit U elem AVX 64"
pre_trans = self.data.view(np.float64)
self.data = trans_bit_elem(pre_trans)
self.fun = ext.untrans_bit_elem_AVX
self.check_data = pre_trans
def test_10a_bitshuffle_64(self):
self.case = "bitshuffle 64"
self.data = self.data.view(np.float64)
self.fun = lambda x: ext.bitshuffle(x, BLOCK)
def test_10b_bitunshuffle_64(self):
self.case = "bitunshuffle 64"
pre_trans = self.data.view(np.float64)
self.data = ext.bitshuffle(pre_trans, BLOCK)
self.fun = lambda x: ext.bitunshuffle(x, BLOCK)
self.check_data = pre_trans
def test_10c_compress_64(self):
self.case = "compress 64"
self.data = self.data.view(np.float64)
        self.fun = lambda x: ext.compress_lz4(x, BLOCK)
def test_10d_decompress_64(self):
self.case = "decompress 64"
pre_trans = self.data.view(np.float64)
self.data = ext.compress_lz4(pre_trans, BLOCK)
self.fun = lambda x: ext.decompress_lz4(x, pre_trans.shape,
pre_trans.dtype, BLOCK)
self.check_data = pre_trans
"""
Commented out to prevent nose from finding them.
class TestDevCases(unittest.TestCase):
def deactivated_test_trans_byte_bitrow_AVX(self):
d = np.arange(256, dtype=np.uint32)
#d = ext.trans_bit_elem(d)
t = ext.trans_byte_bitrow_AVX(d).view(np.uint8)
t1 = ext.trans_byte_bitrow_SSE(d).view(np.uint8)
t.shape = (32, 32)
t1.shape = (32, 32)
#print t[:20,:18]
self.assertTrue(np.all(t == t1))
def deactivated_test_untrans_bit_elem(self):
d = np.arange(32, dtype=np.uint16)
#d = random.randint(0, 2**7, 256).astype(np.uint16)
d1 = ext.trans_bit_elem(d)
#print d
t = ext.untrans_bit_elem_AVX(d1)
#t1 = ext.untrans_bit_byte_scal(d1)
#print np.reshape(d1.view(np.uint8), (16, 4))
#print np.reshape(t1.view(np.uint8), (2, 32))
#print np.reshape(t2.view(np.uint8), (32, 2))
#print np.reshape(t.view(np.uint8), (32, 2))
def deactivated_test_trans_bit_byte(self):
d = np.arange(16, dtype=np.uint16)
t = ext.trans_bit_byte_scal(d)
#print t
t1 = trans_bit_byte(d)
#print t1
self.assertTrue(np.all(t == t1))
def deactivated_test_trans_byte_bitrow_SSE(self):
d = np.arange(256, dtype = np.uint8)
t = ext.trans_byte_bitrow_scal(d)
#print np.reshape(t, (32, 8))
t1 = ext.trans_byte_bitrow_SSE(d)
#print np.reshape(t1, (32, 8))
self.assertTrue(np.all(t == t1))
def deactivated_test_trans_byte_elem_SSE(self):
d = np.empty(16, dtype=([('a', 'u4'), ('b', 'u4'), ('c', 'u4')]))
d['a'] = np.arange(16) * 1
d['b'] = np.arange(16) * 2
d['c'] = np.arange(16) * 3
#print d.dtype.itemsize
#print np.reshape(d.view(np.uint8), (16, 12))
t1 = ext.trans_byte_elem_SSE(d)
#print np.reshape(t1.view(np.uint8), (12, 16))
t0 = trans_byte_elem(d)
#print np.reshape(t0.view(np.uint8), (12, 16))
self.assertTrue(np.all(t0.view(np.uint8) == t1.view(np.uint8)))
def deactivated_test_bitshuffle(self):
d = np.arange(128, dtype=np.uint16)
t1 = ext.bitshuffle(d)
#print t1
t2 = ext.bitunshuffle(t1)
#print t2
self.assertTrue(np.all(t2.view(np.uint8) == d.view(np.uint8)))
"""
class TestOddLengths(unittest.TestCase):
def setUp(self):
self.reps = 10
self.nmax = 128 * 8
#self.nmax = 4 * 8 # XXX
self.fun = ext.copy
self.check = lambda x: x
def test_trans_bit_elem_SSE(self):
self.fun = ext.trans_bit_elem_SSE
self.check = trans_bit_elem
def test_untrans_bit_elem_SSE(self):
self.fun = lambda x: ext.untrans_bit_elem_SSE(ext.trans_bit_elem(x))
self.check = lambda x: x
def test_trans_bit_elem_AVX(self):
self.fun = ext.trans_bit_elem_AVX
self.check = trans_bit_elem
def test_untrans_bit_elem_AVX(self):
        self.fun = lambda x: ext.untrans_bit_elem_AVX(ext.trans_bit_elem(x))
self.check = lambda x: x
def test_trans_bit_elem_scal(self):
self.fun = ext.trans_bit_elem_scal
self.check = trans_bit_elem
def test_untrans_bit_elem_scal(self):
self.fun = lambda x: ext.untrans_bit_elem_scal(ext.trans_bit_elem(x))
self.check = lambda x: x
def test_trans_byte_elem_SSE(self):
self.fun = ext.trans_byte_elem_SSE
self.check = trans_byte_elem
def tearDown(self):
try:
for dtype in TEST_DTYPES:
itemsize = np.dtype(dtype).itemsize
nbyte_max = self.nmax * itemsize
dbuf = random.randint(0, 255, nbyte_max).astype(np.uint8)
dbuf = dbuf.view(dtype)
for ii in range(self.reps):
n = random.randint(0, self.nmax // 8, 1)[0] * 8
data = dbuf[:n]
out = self.fun(data).view(np.uint8)
ans = self.check(data).view(np.uint8)
self.assertTrue(np.all(out == ans))
except RuntimeError as err:
if (len(err.args) > 1 and (err.args[1] == -11)
and not ext.using_SSE2()):
return
if (len(err.args) > 1 and (err.args[1] == -12)
and not ext.using_AVX2()):
return
else:
raise
class TestBitShuffleCircle(unittest.TestCase):
"""Ensure that final filter is circularly consistant for any data type and
any length buffer."""
def test_circle(self):
nmax = 100000
reps = 20
for dtype in TEST_DTYPES:
itemsize = np.dtype(dtype).itemsize
nbyte_max = nmax * itemsize
dbuf = random.randint(0, 255, nbyte_max).astype(np.uint8)
dbuf = dbuf.view(dtype)
for ii in range(reps):
n = random.randint(0, nmax, 1)[0]
data = dbuf[:n]
shuff = ext.bitshuffle(data)
out = ext.bitunshuffle(shuff)
self.assertTrue(out.dtype is data.dtype)
self.assertTrue(np.all(data.view(np.uint8)
== out.view(np.uint8)))
def test_circle_with_compression(self):
nmax = 100000
reps = 20
for dtype in TEST_DTYPES:
itemsize = np.dtype(dtype).itemsize
nbyte_max = nmax * itemsize
dbuf = random.randint(0, 255, nbyte_max).astype(np.uint8)
dbuf = dbuf.view(dtype)
for ii in range(reps):
n = random.randint(0, nmax, 1)[0]
data = dbuf[:n]
shuff = ext.compress_lz4(data)
out = ext.decompress_lz4(shuff, data.shape, data.dtype)
self.assertTrue(out.dtype is data.dtype)
self.assertTrue(np.all(data.view(np.uint8)
== out.view(np.uint8)))
# Python implementations for checking results.
def trans_byte_elem(arr):
dtype = arr.dtype
itemsize = dtype.itemsize
in_buf = arr.flat[:].view(np.uint8)
nelem = in_buf.size // itemsize
in_buf.shape = (nelem, itemsize)
out_buf = np.empty((itemsize, nelem), dtype=np.uint8)
for ii in range(nelem):
for jj in range(itemsize):
out_buf[jj,ii] = in_buf[ii,jj]
return out_buf.flat[:].view(dtype)
def trans_bit_byte(arr):
n = arr.size
dtype = arr.dtype
itemsize = dtype.itemsize
bits = np.unpackbits(arr.view(np.uint8))
bits.shape = (n * itemsize, 8)
# We have to reverse the order of the bits both for unpacking and packing,
# since we want to call the least significant bit the first bit.
bits = bits[:,::-1]
bits_shuff = (bits.T).copy()
bits_shuff.shape = (n * itemsize, 8)
bits_shuff = bits_shuff[:,::-1]
arr_bt = np.packbits(bits_shuff.flat[:])
return arr_bt.view(dtype)
def trans_bit_elem(arr):
n = arr.size
dtype = arr.dtype
itemsize = dtype.itemsize
bits = np.unpackbits(arr.view(np.uint8))
bits.shape = (n * itemsize, 8)
# We have to reverse the order of the bits both for unpacking and packing,
# since we want to call the least significant bit the first bit.
bits = bits[:,::-1].copy()
bits.shape = (n, itemsize * 8)
bits_shuff = (bits.T).copy()
bits_shuff.shape = (n * itemsize, 8)
bits_shuff = bits_shuff[:,::-1]
arr_bt = np.packbits(bits_shuff.flat[:])
return arr_bt.view(dtype)
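
# A small worked example (an illustrative addition, not part of the original
# suite): for uint8 inputs [1, 3], bit plane 0 is [1, 1] and bit plane 1 is
# [0, 1].  Planes are packed starting from the least significant bit, so the
# first output byte carries planes 0-3 of both elements: 0b00001011 == 11.
def demo_trans_bit_elem():
    arr = np.array([1, 3], dtype=np.uint8)
    shuffled = trans_bit_elem(arr).view(np.uint8)
    assert list(shuffled) == [11, 0]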
if __name__ == "__main__":
unittest.main()

bitshuffle-0.3.5/bitshuffle/tests/test_h5filter.py
from __future__ import absolute_import, division, print_function, unicode_literals
import unittest
import os
import glob
import numpy as np
import h5py
from h5py import h5f, h5d, h5z, h5t, h5s, filters
from subprocess import Popen, PIPE, STDOUT
from bitshuffle import h5
os.environ["HDF5_PLUGIN_PATH"] = ""
class TestFilter(unittest.TestCase):
def test_filter(self):
shape = (32 * 1024 + 783,)
chunks = (4 * 1024 + 23,)
dtype = np.int64
data = np.arange(shape[0])
fname = "tmp_test_filters.h5"
f = h5py.File(fname)
h5.create_dataset(f, b"range", shape, dtype, chunks,
                          # Registered HDF5 filter IDs: 32008 = bitshuffle,
                          # 32000 = LZF.
                          filter_pipeline=(32008, 32000),
filter_flags=(h5z.FLAG_MANDATORY, h5z.FLAG_MANDATORY),
filter_opts=None)
f["range"][:] = data
f.close()
f = h5py.File(fname, 'r')
d = f['range'][:]
self.assertTrue(np.all(d == data))
f.close()
def test_with_block_size(self):
shape = (128 * 1024 + 783,)
chunks = (4 * 1024 + 23,)
dtype = np.int64
data = np.arange(shape[0])
fname = "tmp_test_filters.h5"
f = h5py.File(fname)
h5.create_dataset(f, b"range", shape, dtype, chunks,
filter_pipeline=(32008, 32000),
filter_flags=(h5z.FLAG_MANDATORY, h5z.FLAG_MANDATORY),
filter_opts=((680,), ()),
)
f["range"][:] = data
f.close()
#os.system('h5dump -H -p tmp_test_filters.h5')
f = h5py.File(fname, 'r')
d = f['range'][:]
self.assertTrue(np.all(d == data))
f.close()
def test_with_compression(self):
shape = (128 * 1024 + 783,)
chunks = (4 * 1024 + 23,)
dtype = np.int64
data = np.arange(shape[0])
fname = "tmp_test_filters.h5"
f = h5py.File(fname)
h5.create_dataset(f, b"range", shape, dtype, chunks,
filter_pipeline=(32008,),
filter_flags=(h5z.FLAG_MANDATORY,),
filter_opts=((0, h5.H5_COMPRESS_LZ4),),
)
f["range"][:] = data
f.close()
#os.system('h5dump -H -p tmp_test_filters.h5')
f = h5py.File(fname, 'r')
d = f['range'][:]
self.assertTrue(np.all(d == data))
f.close()
def tearDown(self):
files = glob.glob("tmp_test_*")
for f in files:
os.remove(f)
if __name__ == "__main__":
unittest.main()

bitshuffle-0.3.5/bitshuffle/tests/test_h5plugin.py
from __future__ import absolute_import, division, print_function, unicode_literals
import unittest
import os, os.path
import glob
import numpy as np
import h5py
from h5py import h5f, h5d, h5z, h5t, h5s, filters
from subprocess import Popen, PIPE, STDOUT
import bitshuffle
plugin_dir = os.path.join(os.path.dirname(bitshuffle.__file__),
'plugin')
os.environ["HDF5_PLUGIN_PATH"] = plugin_dir
H5VERSION = h5py.h5.get_libversion()
if (H5VERSION[0] < 1 or (H5VERSION[0] == 1
and (H5VERSION[1] < 8 or (H5VERSION[1] == 8 and H5VERSION[2] < 11)))):
H51811P = False
else:
H51811P = True
class TestFilterPlugins(unittest.TestCase):
def test_plugins(self):
if not H51811P:
return
shape = (32 * 1024,)
chunks = (4 * 1024,)
dtype = np.int64
data = np.arange(shape[0])
fname = "tmp_test_filters.h5"
f = h5py.File(fname)
tid = h5t.py_create(dtype, logical=1)
sid = h5s.create_simple(shape, shape)
        # Different APIs for different h5py versions.
try:
dcpl = filters.generate_dcpl(shape, dtype, chunks, None, None,
None, None, None, None)
except TypeError:
dcpl = filters.generate_dcpl(shape, dtype, chunks, None, None,
None, None, None)
dcpl.set_filter(32008, h5z.FLAG_MANDATORY)
dcpl.set_filter(32000, h5z.FLAG_MANDATORY)
dset_id = h5d.create(f.id, b"range", tid, sid, dcpl=dcpl)
dset_id.write(h5s.ALL, h5s.ALL, data)
f.close()
# Make sure the filters are working outside of h5py by calling h5dump
h5dump = Popen(['h5dump', fname],
stdout=PIPE, stderr=STDOUT)
stdout, nothing = h5dump.communicate()
err = h5dump.returncode
self.assertEqual(err, 0)
f = h5py.File(fname, 'r')
d = f['range'][:]
self.assertTrue(np.all(d == data))
f.close()
#def test_h5py_hl(self):
# if not H51811P:
# return
# # Does not appear to be supported by h5py.
# fname = "tmp_test_h5py_hl.h5"
# f = h5py.File(fname)
# f.create_dataset("range", np.arange(1024, dtype=np.int64),
# compression=32008)
def tearDown(self):
files = glob.glob("tmp_test_*")
for f in files:
os.remove(f)
if __name__ == "__main__":
unittest.main()

bitshuffle-0.3.5/bitshuffle/tests/test_regression.py
"""
Test that data encoded with earlier versions can still be decoded correctly.
"""
from __future__ import absolute_import, division, print_function
import unittest
from os import path
import numpy as np
import h5py
import bitshuffle
from bitshuffle import h5
TEST_DATA_DIR = path.dirname(bitshuffle.__file__) + "/tests/data"
OUT_FILE_TEMPLATE = TEST_DATA_DIR + "/regression_%s.h5"
VERSIONS = ["0.1.3",]
class TestAll(unittest.TestCase):
def test_regression(self):
for version in VERSIONS:
file_name = OUT_FILE_TEMPLATE % version
            f = h5py.File(file_name, 'r')
            g_orig = f["origional"]  # [sic] group name as written by make_regression_tdata.py
g_comp = f["compressed"]
for dset_name in g_comp.keys():
self.assertTrue(np.all(g_comp[dset_name][:]
== g_orig[dset_name][:]))
if __name__ == "__main__":
unittest.main()

bitshuffle-0.3.5/conda-recipe/bld.bat
SET CONDA_HOME=%PREFIX%
"%PYTHON%" setup.py install
if errorlevel 1 exit 1

bitshuffle-0.3.5/conda-recipe/build.sh
export CONDA_HOME=$PREFIX
$PYTHON setup.py install   # install the package into the conda build environment

bitshuffle-0.3.5/conda-recipe/meta.yaml
package:
name: bitshuffle
  version: 0.3.5
source:
# git_url: https://github.com/kiyo-masui/bitshuffle.git
# git_rev: 0.2.1
path: ..
patches:
- setup.py.patch
requirements:
build:
- python
- setuptools
- cython
- numpy
- h5py
- hdf5
run:
- python
- numpy
- h5py
- cython
about:
home: https://github.com/kiyo-masui/bitshuffle/blob/master/setup.py
  summary: "Bitshuffle filter library for improving compression of typed binary data."
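
# Usage note (an illustrative addition, not part of the original recipe):
# from the source tree root, `conda build conda-recipe` builds the package;
# setup.py.patch below points setup.py at $CONDA_HOME/include and
# $CONDA_HOME/lib inside the build environment.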

bitshuffle-0.3.5/conda-recipe/setup.py.patch
--- setup.py 2016-01-19 16:56:12.954563000 +0100
+++ xxx.py 2016-01-19 16:56:00.817087000 +0100
@@ -40,8 +40,8 @@
# Copied from h5py.
# TODO, figure out what the canonacal way to do this should be.
-INCLUDE_DIRS = []
-LIBRARY_DIRS = []
+INCLUDE_DIRS = [os.environ['CONDA_HOME'] + '/include']
+LIBRARY_DIRS = [os.environ['CONDA_HOME'] + '/lib']
if sys.platform == 'darwin':
# putting here both macports and homebrew paths will generate
# "ld: warning: dir not found" at the linking phase

bitshuffle-0.3.5/lz4/LICENSE
LZ4 Library
Copyright (c) 2011-2014, Yann Collet
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

bitshuffle-0.3.5/lz4/README.md
LZ4 - Library Files
================================
The __lib__ directory contains several files, but you don't necessarily need them all.
To integrate fast LZ4 compression/decompression into your program, you basically just need "**lz4.c**" and "**lz4.h**".
For more compression at the cost of compression speed (while preserving decompression speed), use **lz4hc** on top of regular lz4. `lz4hc` only provides compression functions. It also needs `lz4` to compile properly.
If you want to produce files or data streams compatible with `lz4` command line utility, use **lz4frame**. This library encapsulates lz4-compressed blocks into the [official interoperable frame format]. In order to work properly, lz4frame needs lz4 and lz4hc, and also **xxhash**, which provides error detection algorithm.
(_Advanced stuff_ : It's possible to hide xxhash symbols into a local namespace. This is what `liblz4` does, to avoid symbol duplication in case a user program would link to several libraries containing xxhash symbols.)
A more complex "lz4frame_static.h" is also provided, although its usage is not recommended. It contains definitions which are not guaranteed to remain stable within future versions. Use for static linking ***only***.
The other files are not source code. There are :
- LICENSE : contains the BSD license text
- Makefile : script to compile or install lz4 library (static or dynamic)
- liblz4.pc.in : for pkg-config (make install)
[official interoperable frame format]: ../lz4_Frame_format.md

bitshuffle-0.3.5/lz4/lz4.c
/*
LZ4 - Fast LZ compression algorithm
Copyright (C) 2011-2015, Yann Collet.
BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
You can contact the author at :
- LZ4 source repository : https://github.com/Cyan4973/lz4
- LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c
*/
/**************************************
* Tuning parameters
**************************************/
/*
* HEAPMODE :
* Select how default compression functions will allocate memory for their hash table,
* in memory stack (0:default, fastest), or in memory heap (1:requires malloc()).
*/
#define HEAPMODE 0
/*
* ACCELERATION_DEFAULT :
* Select "acceleration" for LZ4_compress_fast() when parameter value <= 0
*/
#define ACCELERATION_DEFAULT 1
/**************************************
* CPU Feature Detection
**************************************/
/*
* LZ4_FORCE_SW_BITCOUNT
* Define this parameter if your target system or compiler does not support hardware bit count
*/
#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for Windows CE does not support Hardware bit count */
# define LZ4_FORCE_SW_BITCOUNT
#endif
/**************************************
* Includes
**************************************/
#include "lz4.h"
/**************************************
* Compiler Options
**************************************/
#ifdef _MSC_VER /* Visual Studio */
# define FORCE_INLINE static __forceinline
#  include <intrin.h>
# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
# pragma warning(disable : 4293) /* disable: C4293: too large shift (32-bits) */
#else
# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */
# if defined(__GNUC__) || defined(__clang__)
# define FORCE_INLINE static inline __attribute__((always_inline))
# else
# define FORCE_INLINE static inline
# endif
# else
# define FORCE_INLINE static
# endif /* __STDC_VERSION__ */
#endif /* _MSC_VER */
/* LZ4_GCC_VERSION is defined into lz4.h */
#if (LZ4_GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__)
# define expect(expr,value) (__builtin_expect ((expr),(value)) )
#else
# define expect(expr,value) (expr)
#endif
#define likely(expr) expect((expr) != 0, 1)
#define unlikely(expr) expect((expr) != 0, 0)
/**************************************
* Memory routines
**************************************/
#include <stdlib.h>   /* malloc, calloc, free */
#define ALLOCATOR(n,s) calloc(n,s)
#define FREEMEM free
#include <string.h>   /* memset, memcpy */
#define MEM_INIT memset
/**************************************
* Basic Types
**************************************/
#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */
#  include <stdint.h>
typedef uint8_t BYTE;
typedef uint16_t U16;
typedef uint32_t U32;
typedef int32_t S32;
typedef uint64_t U64;
#else
typedef unsigned char BYTE;
typedef unsigned short U16;
typedef unsigned int U32;
typedef signed int S32;
typedef unsigned long long U64;
#endif
/**************************************
* Reading and writing into memory
**************************************/
#define STEPSIZE sizeof(size_t)
static unsigned LZ4_64bits(void) { return sizeof(void*)==8; }
static unsigned LZ4_isLittleEndian(void)
{
const union { U32 i; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */
return one.c[0];
}
static U16 LZ4_read16(const void* memPtr)
{
U16 val16;
memcpy(&val16, memPtr, 2);
return val16;
}
static U16 LZ4_readLE16(const void* memPtr)
{
if (LZ4_isLittleEndian())
{
return LZ4_read16(memPtr);
}
else
{
const BYTE* p = (const BYTE*)memPtr;
return (U16)((U16)p[0] + (p[1]<<8));
}
}
static void LZ4_writeLE16(void* memPtr, U16 value)
{
if (LZ4_isLittleEndian())
{
memcpy(memPtr, &value, 2);
}
else
{
BYTE* p = (BYTE*)memPtr;
p[0] = (BYTE) value;
p[1] = (BYTE)(value>>8);
}
}
static U32 LZ4_read32(const void* memPtr)
{
U32 val32;
memcpy(&val32, memPtr, 4);
return val32;
}
static U64 LZ4_read64(const void* memPtr)
{
U64 val64;
memcpy(&val64, memPtr, 8);
return val64;
}
static size_t LZ4_read_ARCH(const void* p)
{
if (LZ4_64bits())
return (size_t)LZ4_read64(p);
else
return (size_t)LZ4_read32(p);
}
static void LZ4_copy4(void* dstPtr, const void* srcPtr) { memcpy(dstPtr, srcPtr, 4); }
static void LZ4_copy8(void* dstPtr, const void* srcPtr) { memcpy(dstPtr, srcPtr, 8); }
/* customized version of memcpy, which may overwrite up to 7 bytes beyond dstEnd */
static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd)
{
BYTE* d = (BYTE*)dstPtr;
const BYTE* s = (const BYTE*)srcPtr;
BYTE* e = (BYTE*)dstEnd;
    do { LZ4_copy8(d,s); d+=8; s+=8; } while (d<e);
}


/**************************************
* Common Constants
**************************************/
#define MINMATCH 4

#define COPYLENGTH 8
#define LASTLITERALS 5
#define MFLIMIT (COPYLENGTH+MINMATCH)
static const int LZ4_minLength = (MFLIMIT+1);

#define KB *(1 <<10)
#define MB *(1 <<20)
#define GB *(1U<<30)

#define MAXD_LOG 16
#define MAX_DISTANCE ((1 << MAXD_LOG) - 1)

#define ML_BITS  4
#define ML_MASK  ((1U<<ML_BITS)-1)
#define RUN_BITS (8-ML_BITS)
#define RUN_MASK ((1U<<RUN_BITS)-1)


/**************************************
* Common Utils
**************************************/
#define LZ4_STATIC_ASSERT(c)    { enum { LZ4_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */


/**************************************
* Common functions
**************************************/
static unsigned LZ4_NbCommonBytes (register size_t val)
{
    if (LZ4_isLittleEndian())
    {
        if (LZ4_64bits())
        {
#       if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT)
            unsigned long r = 0;
            _BitScanForward64( &r, (U64)val );
            return (int)(r>>3);
# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT)
return (__builtin_ctzll((U64)val) >> 3);
# else
static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
# endif
}
else /* 32 bits */
{
# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
unsigned long r;
_BitScanForward( &r, (U32)val );
return (int)(r>>3);
# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT)
return (__builtin_ctz((U32)val) >> 3);
# else
static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
# endif
}
}
else /* Big Endian CPU */
{
if (LZ4_64bits())
{
# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT)
unsigned long r = 0;
_BitScanReverse64( &r, val );
return (unsigned)(r>>3);
# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT)
return (__builtin_clzll((U64)val) >> 3);
# else
unsigned r;
if (!(val>>32)) { r=4; } else { r=0; val>>=32; }
if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
r += (!val);
return r;
# endif
}
else /* 32 bits */
{
# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
unsigned long r = 0;
_BitScanReverse( &r, (unsigned long)val );
return (unsigned)(r>>3);
# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT)
return (__builtin_clz((U32)val) >> 3);
# else
unsigned r;
if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
r += (!val);
return r;
# endif
}
}
}
static unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit)
{
const BYTE* const pStart = pIn;
    while (likely(pIn<pInLimit-(STEPSIZE-1)))
    {
        size_t diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn);
        if (!diff) { pIn+=STEPSIZE; pMatch+=STEPSIZE; continue; }
        pIn += LZ4_NbCommonBytes(diff);
        return (unsigned)(pIn - pStart);
    }

    if (LZ4_64bits()) if ((pIn<(pInLimit-3)) && (LZ4_read32(pMatch) == LZ4_read32(pIn))) { pIn+=4; pMatch+=4; }
    if ((pIn<(pInLimit-1)) && (LZ4_read16(pMatch) == LZ4_read16(pIn))) { pIn+=2; pMatch+=2; }
    if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++;
    return (unsigned)(pIn - pStart);
}


/**************************************
* Local Constants
**************************************/
#define LZ4_HASHLOG   (LZ4_MEMORY_USAGE-2)
#define HASHTABLESIZE (1 << LZ4_MEMORY_USAGE)
#define HASH_SIZE_U32 (1 << LZ4_HASHLOG)       /* required as macro for static allocation */

static const int LZ4_64Klimit = ((64 KB) + (MFLIMIT-1));
static const U32 LZ4_skipTrigger = 6;  /* Increase this value ==> compression run slower on incompressible data */
/**************************************
* Local Structures and types
**************************************/
typedef struct {
U32 hashTable[HASH_SIZE_U32];
U32 currentOffset;
U32 initCheck;
const BYTE* dictionary;
BYTE* bufferStart; /* obsolete, used for slideInputBuffer */
U32 dictSize;
} LZ4_stream_t_internal;
typedef enum { notLimited = 0, limitedOutput = 1 } limitedOutput_directive;
typedef enum { byPtr, byU32, byU16 } tableType_t;
typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive;
typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive;
typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive;
typedef enum { full = 0, partial = 1 } earlyEnd_directive;
/**************************************
* Local Utils
**************************************/
int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; }
int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); }
int LZ4_sizeofState() { return LZ4_STREAMSIZE; }
/********************************
* Compression functions
********************************/
static U32 LZ4_hashSequence(U32 sequence, tableType_t const tableType)
{
if (tableType == byU16)
return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1)));
else
return (((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG));
}
static const U64 prime5bytes = 889523592379ULL;
static U32 LZ4_hashSequence64(size_t sequence, tableType_t const tableType)
{
const U32 hashLog = (tableType == byU16) ? LZ4_HASHLOG+1 : LZ4_HASHLOG;
    const U32 hashMask = (1<<hashLog) - 1;
    return ((sequence * prime5bytes) >> (40 - hashLog)) & hashMask;
}
static U32 LZ4_hashSequenceT(size_t sequence, tableType_t const tableType)
{
if (LZ4_64bits())
return LZ4_hashSequence64(sequence, tableType);
return LZ4_hashSequence((U32)sequence, tableType);
}
static U32 LZ4_hashPosition(const void* p, tableType_t tableType) { return LZ4_hashSequenceT(LZ4_read_ARCH(p), tableType); }
static void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t const tableType, const BYTE* srcBase)
{
switch (tableType)
{
case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = p; return; }
case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); return; }
case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); return; }
}
}
static void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase)
{
U32 h = LZ4_hashPosition(p, tableType);
LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase);
}
static const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase)
{
if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; }
if (tableType == byU32) { U32* hashTable = (U32*) tableBase; return hashTable[h] + srcBase; }
{ U16* hashTable = (U16*) tableBase; return hashTable[h] + srcBase; } /* default, to ensure a return */
}
static const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase)
{
U32 h = LZ4_hashPosition(p, tableType);
return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase);
}
FORCE_INLINE int LZ4_compress_generic(
void* const ctx,
const char* const source,
char* const dest,
const int inputSize,
const int maxOutputSize,
const limitedOutput_directive outputLimited,
const tableType_t tableType,
const dict_directive dict,
const dictIssue_directive dictIssue,
const U32 acceleration)
{
LZ4_stream_t_internal* const dictPtr = (LZ4_stream_t_internal*)ctx;
const BYTE* ip = (const BYTE*) source;
const BYTE* base;
const BYTE* lowLimit;
const BYTE* const lowRefLimit = ip - dictPtr->dictSize;
const BYTE* const dictionary = dictPtr->dictionary;
const BYTE* const dictEnd = dictionary + dictPtr->dictSize;
const size_t dictDelta = dictEnd - (const BYTE*)source;
const BYTE* anchor = (const BYTE*) source;
const BYTE* const iend = ip + inputSize;
const BYTE* const mflimit = iend - MFLIMIT;
const BYTE* const matchlimit = iend - LASTLITERALS;
BYTE* op = (BYTE*) dest;
BYTE* const olimit = op + maxOutputSize;
U32 forwardH;
size_t refDelta=0;
/* Init conditions */
if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported input size, too large (or negative) */
switch(dict)
{
case noDict:
default:
base = (const BYTE*)source;
lowLimit = (const BYTE*)source;
break;
case withPrefix64k:
base = (const BYTE*)source - dictPtr->currentOffset;
lowLimit = (const BYTE*)source - dictPtr->dictSize;
break;
case usingExtDict:
base = (const BYTE*)source - dictPtr->currentOffset;
lowLimit = (const BYTE*)source;
break;
}
if ((tableType == byU16) && (inputSize>=LZ4_64Klimit)) return 0; /* Size too large (not within 64K limit) */
    if (inputSize<LZ4_minLength) goto _last_literals;                  /* Input too small, no compression (all literals) */

    /* First Byte */
    LZ4_putPosition(ip, ctx, tableType, base);
    ip++; forwardH = LZ4_hashPosition(ip, tableType);

    /* Main Loop */
    for ( ; ; )
    {
        const BYTE* match;
        BYTE* token;

        /* Find a match */
        {
            const BYTE* forwardIp = ip;
            unsigned step = 1;
            unsigned searchMatchNb = acceleration << LZ4_skipTrigger;

            do {
                U32 h = forwardH;
                ip = forwardIp;
                forwardIp += step;
                step = (searchMatchNb++ >> LZ4_skipTrigger);
if (unlikely(forwardIp > mflimit)) goto _last_literals;
match = LZ4_getPositionOnHash(h, ctx, tableType, base);
if (dict==usingExtDict)
{
if (match<(const BYTE*)source)
{
refDelta = dictDelta;
lowLimit = dictionary;
}
else
{
refDelta = 0;
lowLimit = (const BYTE*)source;
}
}
forwardH = LZ4_hashPosition(forwardIp, tableType);
LZ4_putPositionOnHash(ip, h, ctx, tableType, base);
} while ( ((dictIssue==dictSmall) ? (match < lowRefLimit) : 0)
|| ((tableType==byU16) ? 0 : (match + MAX_DISTANCE < ip))
|| (LZ4_read32(match+refDelta) != LZ4_read32(ip)) );
}
/* Catch up */
while ((ip>anchor) && (match+refDelta > lowLimit) && (unlikely(ip[-1]==match[refDelta-1]))) { ip--; match--; }
{
/* Encode Literal length */
unsigned litLength = (unsigned)(ip - anchor);
token = op++;
if ((outputLimited) && (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit)))
return 0; /* Check output limit */
if (litLength>=RUN_MASK)
{
int len = (int)litLength-RUN_MASK;
            *token=(RUN_MASK<<ML_BITS);
            for(; len >= 255 ; len-=255) *op++ = 255;
*op++ = (BYTE)len;
}
        else *token = (BYTE)(litLength<<ML_BITS);

        /* Copy Literals */
        LZ4_wildCopy(op, anchor, op+litLength);
        op+=litLength;
    }

_next_match:
    /* Encode Offset */
    LZ4_writeLE16(op, (U16)(ip-match)); op+=2;

    /* Encode MatchLength */
    {
        unsigned matchLength;

        if ((dict==usingExtDict) && (lowLimit==dictionary))
        {
            const BYTE* limit;
            match += refDelta;
            limit = ip + (dictEnd-match);
            if (limit > matchlimit) limit = matchlimit;
matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, limit);
ip += MINMATCH + matchLength;
if (ip==limit)
{
unsigned more = LZ4_count(ip, (const BYTE*)source, matchlimit);
matchLength += more;
ip += more;
}
}
else
{
matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit);
ip += MINMATCH + matchLength;
}
if ((outputLimited) && (unlikely(op + (1 + LASTLITERALS) + (matchLength>>8) > olimit)))
return 0; /* Check output limit */
if (matchLength>=ML_MASK)
{
*token += ML_MASK;
matchLength -= ML_MASK;
for (; matchLength >= 510 ; matchLength-=510) { *op++ = 255; *op++ = 255; }
if (matchLength >= 255) { matchLength-=255; *op++ = 255; }
*op++ = (BYTE)matchLength;
}
else *token += (BYTE)(matchLength);
}
anchor = ip;
/* Test end of chunk */
if (ip > mflimit) break;
/* Fill table */
LZ4_putPosition(ip-2, ctx, tableType, base);
/* Test next position */
match = LZ4_getPosition(ip, ctx, tableType, base);
if (dict==usingExtDict)
{
if (match<(const BYTE*)source)
{
refDelta = dictDelta;
lowLimit = dictionary;
}
else
{
refDelta = 0;
lowLimit = (const BYTE*)source;
}
}
LZ4_putPosition(ip, ctx, tableType, base);
if ( ((dictIssue==dictSmall) ? (match>=lowRefLimit) : 1)
&& (match+MAX_DISTANCE>=ip)
&& (LZ4_read32(match+refDelta)==LZ4_read32(ip)) )
{ token=op++; *token=0; goto _next_match; }
/* Prepare next loop */
forwardH = LZ4_hashPosition(++ip, tableType);
}
_last_literals:
/* Encode Last Literals */
{
const size_t lastRun = (size_t)(iend - anchor);
if ((outputLimited) && ((op - (BYTE*)dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize))
return 0; /* Check output limit */
if (lastRun >= RUN_MASK)
{
size_t accumulator = lastRun - RUN_MASK;
*op++ = RUN_MASK << ML_BITS;
for(; accumulator >= 255 ; accumulator-=255) *op++ = 255;
*op++ = (BYTE) accumulator;
}
else
{
            *op++ = (BYTE)(lastRun<<ML_BITS);
        }
        memcpy(op, anchor, lastRun);
        op += lastRun;
    }

    /* End */
    return (int) (((char*)op)-dest);
}


int LZ4_compress_fast_extState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration)
{
    LZ4_resetStream((LZ4_stream_t*)state);
    if (acceleration < 1) acceleration = ACCELERATION_DEFAULT;

    if (maxOutputSize >= LZ4_compressBound(inputSize))
{
if (inputSize < LZ4_64Klimit)
return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, byU16, noDict, noDictIssue, acceleration);
else
return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration);
}
else
{
if (inputSize < LZ4_64Klimit)
return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration);
else
return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration);
}
}
int LZ4_compress_fast(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration)
{
#if (HEAPMODE)
void* ctxPtr = ALLOCATOR(1, sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */
#else
LZ4_stream_t ctx;
void* ctxPtr = &ctx;
#endif
int result = LZ4_compress_fast_extState(ctxPtr, source, dest, inputSize, maxOutputSize, acceleration);
#if (HEAPMODE)
FREEMEM(ctxPtr);
#endif
return result;
}
int LZ4_compress_default(const char* source, char* dest, int inputSize, int maxOutputSize)
{
return LZ4_compress_fast(source, dest, inputSize, maxOutputSize, 1);
}
/* hidden debug function */
/* strangely enough, gcc generates faster code when this function is uncommented, even if unused */
int LZ4_compress_fast_force(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration)
{
LZ4_stream_t ctx;
LZ4_resetStream(&ctx);
if (inputSize < LZ4_64Klimit)
return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration);
else
return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration);
}
/********************************
* destSize variant
********************************/
static int LZ4_compress_destSize_generic(
void* const ctx,
const char* const src,
char* const dst,
int* const srcSizePtr,
const int targetDstSize,
const tableType_t tableType)
{
const BYTE* ip = (const BYTE*) src;
const BYTE* base = (const BYTE*) src;
const BYTE* lowLimit = (const BYTE*) src;
const BYTE* anchor = ip;
const BYTE* const iend = ip + *srcSizePtr;
const BYTE* const mflimit = iend - MFLIMIT;
const BYTE* const matchlimit = iend - LASTLITERALS;
BYTE* op = (BYTE*) dst;
BYTE* const oend = op + targetDstSize;
BYTE* const oMaxLit = op + targetDstSize - 2 /* offset */ - 8 /* because 8+MINMATCH==MFLIMIT */ - 1 /* token */;
BYTE* const oMaxMatch = op + targetDstSize - (LASTLITERALS + 1 /* token */);
BYTE* const oMaxSeq = oMaxLit - 1 /* token */;
U32 forwardH;
/* Init conditions */
if (targetDstSize < 1) return 0; /* Impossible to store anything */
if ((U32)*srcSizePtr > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported input size, too large (or negative) */
if ((tableType == byU16) && (*srcSizePtr>=LZ4_64Klimit)) return 0; /* Size too large (not within 64K limit) */
    if (*srcSizePtr<LZ4_minLength) goto _last_literals;                 /* Input too small, no compression (all literals) */

    /* First Byte */
    LZ4_putPosition(ip, ctx, tableType, base);
    ip++; forwardH = LZ4_hashPosition(ip, tableType);

    /* Main Loop */
    for ( ; ; )
    {
        const BYTE* match;
        BYTE* token;

        /* Find a match */
        {
            const BYTE* forwardIp = ip;
            unsigned step = 1;
            unsigned searchMatchNb = 1 << LZ4_skipTrigger;

            do {
                U32 h = forwardH;
                ip = forwardIp;
                forwardIp += step;
                step = (searchMatchNb++ >> LZ4_skipTrigger);
if (unlikely(forwardIp > mflimit))
goto _last_literals;
match = LZ4_getPositionOnHash(h, ctx, tableType, base);
forwardH = LZ4_hashPosition(forwardIp, tableType);
LZ4_putPositionOnHash(ip, h, ctx, tableType, base);
} while ( ((tableType==byU16) ? 0 : (match + MAX_DISTANCE < ip))
|| (LZ4_read32(match) != LZ4_read32(ip)) );
}
/* Catch up */
while ((ip>anchor) && (match > lowLimit) && (unlikely(ip[-1]==match[-1]))) { ip--; match--; }
{
/* Encode Literal length */
unsigned litLength = (unsigned)(ip - anchor);
token = op++;
if (op + ((litLength+240)/255) + litLength > oMaxLit)
{
/* Not enough space for a last match */
op--;
goto _last_literals;
}
if (litLength>=RUN_MASK)
{
unsigned len = litLength - RUN_MASK;
            *token=(RUN_MASK<<ML_BITS);
            for(; len >= 255 ; len-=255) *op++ = 255;
*op++ = (BYTE)len;
}
        else *token = (BYTE)(litLength<<ML_BITS);

        /* Copy Literals */
        LZ4_wildCopy(op, anchor, op+litLength);
        op += litLength;
    }

_next_match:
    /* Encode Offset */
    LZ4_writeLE16(op, (U16)(ip-match)); op+=2;

    /* Encode MatchLength */
    {
        size_t matchLength;

        matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit);

        if (op + ((matchLength+240)/255) > oMaxMatch)
{
/* Match description too long : reduce it */
matchLength = (15-1) + (oMaxMatch-op) * 255;
}
//printf("offset %5i, matchLength%5i \n", (int)(ip-match), matchLength + MINMATCH);
ip += MINMATCH + matchLength;
if (matchLength>=ML_MASK)
{
*token += ML_MASK;
matchLength -= ML_MASK;
while (matchLength >= 255) { matchLength-=255; *op++ = 255; }
*op++ = (BYTE)matchLength;
}
else *token += (BYTE)(matchLength);
}
anchor = ip;
/* Test end of block */
if (ip > mflimit) break;
if (op > oMaxSeq) break;
/* Fill table */
LZ4_putPosition(ip-2, ctx, tableType, base);
/* Test next position */
match = LZ4_getPosition(ip, ctx, tableType, base);
LZ4_putPosition(ip, ctx, tableType, base);
if ( (match+MAX_DISTANCE>=ip)
&& (LZ4_read32(match)==LZ4_read32(ip)) )
{ token=op++; *token=0; goto _next_match; }
/* Prepare next loop */
forwardH = LZ4_hashPosition(++ip, tableType);
}
_last_literals:
/* Encode Last Literals */
{
size_t lastRunSize = (size_t)(iend - anchor);
if (op + 1 /* token */ + ((lastRunSize+240)/255) /* litLength */ + lastRunSize /* literals */ > oend)
{
/* adapt lastRunSize to fill 'dst' */
lastRunSize = (oend-op) - 1;
lastRunSize -= (lastRunSize+240)/255;
}
ip = anchor + lastRunSize;
if (lastRunSize >= RUN_MASK)
{
size_t accumulator = lastRunSize - RUN_MASK;
*op++ = RUN_MASK << ML_BITS;
for(; accumulator >= 255 ; accumulator-=255) *op++ = 255;
*op++ = (BYTE) accumulator;
}
else
{
            *op++ = (BYTE)(lastRunSize<<ML_BITS);
        }
        memcpy(op, anchor, lastRunSize);
        op += lastRunSize;
    }

    /* End */
    *srcSizePtr = (int) (((const char*)ip)-src);
    return (int) (((char*)op)-dst);
}


static int LZ4_compress_destSize_extState (void* state, const char* src, char* dst, int* srcSizePtr, int targetDstSize)
{
    LZ4_resetStream((LZ4_stream_t*)state);

    if (targetDstSize >= LZ4_compressBound(*srcSizePtr))   /* compression success is guaranteed */
{
return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, targetDstSize, 1);
}
else
{
if (*srcSizePtr < LZ4_64Klimit)
return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, byU16);
else
return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, LZ4_64bits() ? byU32 : byPtr);
}
}
int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize)
{
#if (HEAPMODE)
void* ctx = ALLOCATOR(1, sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */
#else
LZ4_stream_t ctxBody;
void* ctx = &ctxBody;
#endif
int result = LZ4_compress_destSize_extState(ctx, src, dst, srcSizePtr, targetDstSize);
#if (HEAPMODE)
FREEMEM(ctx);
#endif
return result;
}
/********************************
* Streaming functions
********************************/
LZ4_stream_t* LZ4_createStream(void)
{
LZ4_stream_t* lz4s = (LZ4_stream_t*)ALLOCATOR(8, LZ4_STREAMSIZE_U64);
LZ4_STATIC_ASSERT(LZ4_STREAMSIZE >= sizeof(LZ4_stream_t_internal)); /* A compilation error here means LZ4_STREAMSIZE is not large enough */
LZ4_resetStream(lz4s);
return lz4s;
}
void LZ4_resetStream (LZ4_stream_t* LZ4_stream)
{
MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t));
}
int LZ4_freeStream (LZ4_stream_t* LZ4_stream)
{
FREEMEM(LZ4_stream);
return (0);
}
#define HASH_UNIT sizeof(size_t)
int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize)
{
LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict;
const BYTE* p = (const BYTE*)dictionary;
const BYTE* const dictEnd = p + dictSize;
const BYTE* base;
if ((dict->initCheck) || (dict->currentOffset > 1 GB)) /* Uninitialized structure, or reuse overflow */
LZ4_resetStream(LZ4_dict);
if (dictSize < (int)HASH_UNIT)
{
dict->dictionary = NULL;
dict->dictSize = 0;
return 0;
}
if ((dictEnd - p) > 64 KB) p = dictEnd - 64 KB;
dict->currentOffset += 64 KB;
base = p - dict->currentOffset;
dict->dictionary = p;
dict->dictSize = (U32)(dictEnd - p);
dict->currentOffset += dict->dictSize;
while (p <= dictEnd-HASH_UNIT)
{
LZ4_putPosition(p, dict->hashTable, byU32, base);
p+=3;
}
return dict->dictSize;
}
static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, const BYTE* src)
{
if ((LZ4_dict->currentOffset > 0x80000000) ||
((size_t)LZ4_dict->currentOffset > (size_t)src)) /* address space overflow */
{
/* rescale hash table */
U32 delta = LZ4_dict->currentOffset - 64 KB;
const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize;
int i;
        for (i=0; i<HASH_SIZE_U32; i++)
        {
            if (LZ4_dict->hashTable[i] < delta) LZ4_dict->hashTable[i]=0;
else LZ4_dict->hashTable[i] -= delta;
}
LZ4_dict->currentOffset = 64 KB;
if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB;
LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize;
}
}
int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration)
{
LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_stream;
const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize;
const BYTE* smallest = (const BYTE*) source;
if (streamPtr->initCheck) return 0; /* Uninitialized structure detected */
if ((streamPtr->dictSize>0) && (smallest>dictEnd)) smallest = dictEnd;
LZ4_renormDictT(streamPtr, smallest);
if (acceleration < 1) acceleration = ACCELERATION_DEFAULT;
/* Check overlapping input/dictionary space */
{
const BYTE* sourceEnd = (const BYTE*) source + inputSize;
if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd))
{
streamPtr->dictSize = (U32)(dictEnd - sourceEnd);
if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB;
if (streamPtr->dictSize < 4) streamPtr->dictSize = 0;
streamPtr->dictionary = dictEnd - streamPtr->dictSize;
}
}
/* prefix mode : source data follows dictionary */
if (dictEnd == (const BYTE*)source)
{
int result;
if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset))
result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, dictSmall, acceleration);
else
result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, noDictIssue, acceleration);
streamPtr->dictSize += (U32)inputSize;
streamPtr->currentOffset += (U32)inputSize;
return result;
}
/* external dictionary mode */
{
int result;
if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset))
result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, dictSmall, acceleration);
else
result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, noDictIssue, acceleration);
streamPtr->dictionary = (const BYTE*)source;
streamPtr->dictSize = (U32)inputSize;
streamPtr->currentOffset += (U32)inputSize;
return result;
}
}
/* Hidden debug function, to force external dictionary mode */
int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int inputSize)
{
LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_dict;
int result;
const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize;
const BYTE* smallest = dictEnd;
if (smallest > (const BYTE*) source) smallest = (const BYTE*) source;
LZ4_renormDictT((LZ4_stream_t_internal*)LZ4_dict, smallest);
result = LZ4_compress_generic(LZ4_dict, source, dest, inputSize, 0, notLimited, byU32, usingExtDict, noDictIssue, 1);
streamPtr->dictionary = (const BYTE*)source;
streamPtr->dictSize = (U32)inputSize;
streamPtr->currentOffset += (U32)inputSize;
return result;
}
int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize)
{
LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict;
const BYTE* previousDictEnd = dict->dictionary + dict->dictSize;
if ((U32)dictSize > 64 KB) dictSize = 64 KB; /* useless to define a dictionary > 64 KB */
if ((U32)dictSize > dict->dictSize) dictSize = dict->dictSize;
memmove(safeBuffer, previousDictEnd - dictSize, dictSize);
dict->dictionary = (const BYTE*)safeBuffer;
dict->dictSize = (U32)dictSize;
return dictSize;
}
/*******************************
* Decompression functions
*******************************/
/*
* This generic decompression function covers all use cases.
* It shall be instantiated several times, using different sets of directives
* Note that it is essential this generic function is really inlined,
* in order to remove useless branches during compilation optimization.
*/
FORCE_INLINE int LZ4_decompress_generic(
const char* const source,
char* const dest,
int inputSize,
int outputSize, /* If endOnInput==endOnInputSize, this value is the max size of Output Buffer. */
int endOnInput, /* endOnOutputSize, endOnInputSize */
int partialDecoding, /* full, partial */
int targetOutputSize, /* only used if partialDecoding==partial */
int dict, /* noDict, withPrefix64k, usingExtDict */
const BYTE* const lowPrefix, /* == dest if dict == noDict */
const BYTE* const dictStart, /* only if dict==usingExtDict */
const size_t dictSize /* note : = 0 if noDict */
)
{
/* Local Variables */
const BYTE* ip = (const BYTE*) source;
const BYTE* const iend = ip + inputSize;
BYTE* op = (BYTE*) dest;
BYTE* const oend = op + outputSize;
BYTE* cpy;
BYTE* oexit = op + targetOutputSize;
const BYTE* const lowLimit = lowPrefix - dictSize;
const BYTE* const dictEnd = (const BYTE*)dictStart + dictSize;
const size_t dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4};
const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3};
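    /* note : for offsets < 8, dec32table[offset] advances the source of the
       second 4-byte copy so that the repeated pattern is preserved, and
       dec64table[offset] then re-adjusts 'match' so the following 8-byte
       copies stay coherent (see "copy repeated sequence" below). */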
const int safeDecode = (endOnInput==endOnInputSize);
const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB)));
/* Special cases */
if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT; /* targetOutputSize too high => decode everything */
if ((endOnInput) && (unlikely(outputSize==0))) return ((inputSize==1) && (*ip==0)) ? 0 : -1; /* Empty output buffer */
if ((!endOnInput) && (unlikely(outputSize==0))) return (*ip==0?1:-1);
/* Main Loop */
while (1)
{
unsigned token;
size_t length;
const BYTE* match;
/* get literal length */
token = *ip++;
if ((length=(token>>ML_BITS)) == RUN_MASK)
{
unsigned s;
do
{
s = *ip++;
length += s;
}
            while (likely((endOnInput)?ip<iend-RUN_MASK:1) && (s==255));
            if ((safeDecode) && unlikely((size_t)(op+length)<(size_t)(op))) goto _output_error;   /* overflow detection */
            if ((safeDecode) && unlikely((size_t)(ip+length)<(size_t)(ip))) goto _output_error;   /* overflow detection */
        }
        /* copy literals */
        cpy = op+length;
        if ( ((endOnInput) && ((cpy>(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) )
            || ((!endOnInput) && (cpy>oend-COPYLENGTH)) )
{
if (partialDecoding)
{
if (cpy > oend) goto _output_error; /* Error : write attempt beyond end of output buffer */
if ((endOnInput) && (ip+length > iend)) goto _output_error; /* Error : read attempt beyond end of input buffer */
}
else
{
if ((!endOnInput) && (cpy != oend)) goto _output_error; /* Error : block decoding must stop exactly there */
if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; /* Error : input must be consumed */
}
memcpy(op, ip, length);
ip += length;
op += length;
break; /* Necessarily EOF, due to parsing restrictions */
}
LZ4_wildCopy(op, ip, cpy);
ip += length; op = cpy;
/* get offset */
match = cpy - LZ4_readLE16(ip); ip+=2;
if ((checkOffset) && (unlikely(match < lowLimit))) goto _output_error; /* Error : offset outside destination buffer */
/* get matchlength */
length = token & ML_MASK;
if (length == ML_MASK)
{
unsigned s;
do
{
if ((endOnInput) && (ip > iend-LASTLITERALS)) goto _output_error;
s = *ip++;
length += s;
} while (s==255);
if ((safeDecode) && unlikely((size_t)(op+length)<(size_t)op)) goto _output_error; /* overflow detection */
}
length += MINMATCH;
/* check external dictionary */
if ((dict==usingExtDict) && (match < lowPrefix))
{
if (unlikely(op+length > oend-LASTLITERALS)) goto _output_error; /* doesn't respect parsing restriction */
if (length <= (size_t)(lowPrefix-match))
{
/* match can be copied as a single segment from external dictionary */
match = dictEnd - (lowPrefix-match);
memmove(op, match, length); op += length;
}
else
{
/* match encompass external dictionary and current segment */
size_t copySize = (size_t)(lowPrefix-match);
memcpy(op, dictEnd - copySize, copySize);
op += copySize;
copySize = length - copySize;
if (copySize > (size_t)(op-lowPrefix)) /* overlap within current segment */
{
BYTE* const endOfMatch = op + copySize;
const BYTE* copyFrom = lowPrefix;
while (op < endOfMatch) *op++ = *copyFrom++;
}
else
{
memcpy(op, lowPrefix, copySize);
op += copySize;
}
}
continue;
}
/* copy repeated sequence */
cpy = op + length;
if (unlikely((op-match)<8))
{
const size_t dec64 = dec64table[op-match];
op[0] = match[0];
op[1] = match[1];
op[2] = match[2];
op[3] = match[3];
match += dec32table[op-match];
LZ4_copy4(op+4, match);
op += 8; match -= dec64;
} else { LZ4_copy8(op, match); op+=8; match+=8; }
if (unlikely(cpy>oend-12))
{
if (cpy > oend-LASTLITERALS) goto _output_error; /* Error : last LASTLITERALS bytes must be literals */
if (op < oend-8)
{
LZ4_wildCopy(op, match, oend-8);
match += (oend-8) - op;
op = oend-8;
}
            while (op<cpy) *op++ = *match++;
        }
        else
            LZ4_wildCopy(op, match, cpy);
        op=cpy;   /* correction */
    }
    /* end of decoding */
    if (endOnInput)
        return (int) (((char*)op)-dest);     /* Nb of output bytes decoded */
    else
        return (int) (((const char*)ip)-source);   /* Nb of input bytes read */
    /* Overflow error detected */
_output_error:
    return (int) (-(((const char*)ip)-source))-1;
}
int LZ4_decompress_safe(const char* source, char* dest, int compressedSize, int maxDecompressedSize)
{
    return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, endOnInputSize, full, 0, noDict, (BYTE*)dest, NULL, 0);
}
int LZ4_decompress_safe_partial(const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize)
{
    return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, endOnInputSize, partial, targetOutputSize, noDict, (BYTE*)dest, NULL, 0);
}
int LZ4_decompress_fast(const char* source, char* dest, int originalSize)
{
    return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, (BYTE*)(dest - 64 KB), NULL, 64 KB);
}
/*******************************
* Streaming Decompression
*******************************/
typedef struct
{
    const BYTE* externalDict;
    size_t extDictSize;
    const BYTE* prefixEnd;
    size_t prefixSize;
} LZ4_streamDecode_t_internal;
/*
* If you prefer dynamic allocation methods,
* LZ4_createStreamDecode()
* provides a pointer (void*) towards an initialized LZ4_streamDecode_t structure.
*/
LZ4_streamDecode_t* LZ4_createStreamDecode(void)
{
    LZ4_streamDecode_t* lz4s = (LZ4_streamDecode_t*) ALLOCATOR(1, sizeof(LZ4_streamDecode_t));
    return lz4s;
}
int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream)
{
    FREEMEM(LZ4_stream);
    return 0;
}
/*
* LZ4_setStreamDecode :
* Use this function to instruct where to find the dictionary.
* This function is not necessary if previous data is still available where it was decoded.
* Loading a size of 0 is allowed (same effect as no dictionary).
* Return : 1 if OK, 0 if error
*/
int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize)
{
    LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode;
    lz4sd->prefixSize = (size_t) dictSize;
lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize;
lz4sd->externalDict = NULL;
lz4sd->extDictSize = 0;
return 1;
}
/*
*_continue() :
These decoding functions allow decompression of multiple blocks in "streaming" mode.
Previously decoded blocks must still be available at the memory position where they were decoded.
If it's not possible, save the relevant part of decoded data into a safe buffer,
and indicate where it stands using LZ4_setStreamDecode()
*/
int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize)
{
LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode;
int result;
if (lz4sd->prefixEnd == (BYTE*)dest)
{
result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize,
endOnInputSize, full, 0,
usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize);
if (result <= 0) return result;
lz4sd->prefixSize += result;
lz4sd->prefixEnd += result;
}
else
{
lz4sd->extDictSize = lz4sd->prefixSize;
lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize;
result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize,
endOnInputSize, full, 0,
usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize);
if (result <= 0) return result;
lz4sd->prefixSize = result;
lz4sd->prefixEnd = (BYTE*)dest + result;
}
return result;
}
int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize)
{
LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode;
int result;
if (lz4sd->prefixEnd == (BYTE*)dest)
{
result = LZ4_decompress_generic(source, dest, 0, originalSize,
endOnOutputSize, full, 0,
usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize);
if (result <= 0) return result;
lz4sd->prefixSize += originalSize;
lz4sd->prefixEnd += originalSize;
}
else
{
lz4sd->extDictSize = lz4sd->prefixSize;
lz4sd->externalDict = (BYTE*)dest - lz4sd->extDictSize;
result = LZ4_decompress_generic(source, dest, 0, originalSize,
endOnOutputSize, full, 0,
usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize);
if (result <= 0) return result;
lz4sd->prefixSize = originalSize;
lz4sd->prefixEnd = (BYTE*)dest + originalSize;
}
return result;
}
/*
Advanced decoding functions :
*_usingDict() :
These decoding functions work the same as "_continue" ones,
the dictionary must be explicitly provided within parameters
*/
FORCE_INLINE int LZ4_decompress_usingDict_generic(const char* source, char* dest, int compressedSize, int maxOutputSize, int safe, const char* dictStart, int dictSize)
{
if (dictSize==0)
return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest, NULL, 0);
if (dictStart+dictSize == dest)
{
if (dictSize >= (int)(64 KB - 1))
return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, withPrefix64k, (BYTE*)dest-64 KB, NULL, 0);
return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest-dictSize, NULL, 0);
}
return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize);
}
int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize)
{
return LZ4_decompress_usingDict_generic(source, dest, compressedSize, maxOutputSize, 1, dictStart, dictSize);
}
int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize)
{
return LZ4_decompress_usingDict_generic(source, dest, 0, originalSize, 0, dictStart, dictSize);
}
/* debug function */
int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize)
{
return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize);
}
/***************************************************
* Obsolete Functions
***************************************************/
/* obsolete compression functions */
int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) { return LZ4_compress_default(source, dest, inputSize, maxOutputSize); }
int LZ4_compress(const char* source, char* dest, int inputSize) { return LZ4_compress_default(source, dest, inputSize, LZ4_compressBound(inputSize)); }
int LZ4_compress_limitedOutput_withState (void* state, const char* src, char* dst, int srcSize, int dstSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); }
int LZ4_compress_withState (void* state, const char* src, char* dst, int srcSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, LZ4_compressBound(srcSize), 1); }
int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, maxDstSize, 1); }
int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize) { return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, LZ4_compressBound(inputSize), 1); }
/*
These function names are deprecated and should no longer be used.
They are only provided here for compatibility with older user programs.
- LZ4_uncompress is totally equivalent to LZ4_decompress_fast
- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe
*/
int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); }
int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); }
/* Obsolete Streaming functions */
int LZ4_sizeofStreamState() { return LZ4_STREAMSIZE; }
static void LZ4_init(LZ4_stream_t_internal* lz4ds, BYTE* base)
{
MEM_INIT(lz4ds, 0, LZ4_STREAMSIZE);
lz4ds->bufferStart = base;
}
int LZ4_resetStreamState(void* state, char* inputBuffer)
{
if ((((size_t)state) & 3) != 0) return 1; /* Error : pointer is not aligned on 4-bytes boundary */
LZ4_init((LZ4_stream_t_internal*)state, (BYTE*)inputBuffer);
return 0;
}
void* LZ4_create (char* inputBuffer)
{
void* lz4ds = ALLOCATOR(8, LZ4_STREAMSIZE_U64);
LZ4_init ((LZ4_stream_t_internal*)lz4ds, (BYTE*)inputBuffer);
return lz4ds;
}
char* LZ4_slideInputBuffer (void* LZ4_Data)
{
LZ4_stream_t_internal* ctx = (LZ4_stream_t_internal*)LZ4_Data;
int dictSize = LZ4_saveDict((LZ4_stream_t*)LZ4_Data, (char*)ctx->bufferStart, 64 KB);
return (char*)(ctx->bufferStart + dictSize);
}
/* Obsolete streaming decompression functions */
int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize)
{
return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB);
}
int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize)
{
return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB);
}
#endif /* LZ4_COMMONDEFS_ONLY */
/* ==== bitshuffle-0.3.5/lz4/lz4.h ==== */
/*
LZ4 - Fast LZ compression algorithm
Header File
Copyright (C) 2011-2015, Yann Collet.
BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
You can contact the author at :
- LZ4 source repository : https://github.com/Cyan4973/lz4
- LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c
*/
#pragma once
#if defined (__cplusplus)
extern "C" {
#endif
/*
* lz4.h provides block compression functions, and gives full buffer control to the programmer.
* If you need to generate inter-operable compressed data (respecting LZ4 frame specification),
* and can let the library handle its own memory, please use lz4frame.h instead.
*/
/**************************************
* Version
**************************************/
#define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */
#define LZ4_VERSION_MINOR 7 /* for new (non-breaking) interface capabilities */
#define LZ4_VERSION_RELEASE 1 /* for tweaks, bug-fixes, or development */
#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE)
int LZ4_versionNumber (void);
/**************************************
* Tuning parameter
**************************************/
/*
* LZ4_MEMORY_USAGE :
* Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
* Increasing memory usage improves compression ratio
* Reduced memory usage can improve speed, due to cache effect
* Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache
*/
#define LZ4_MEMORY_USAGE 14
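/* For example : LZ4_MEMORY_USAGE == 14 gives a 2^14 = 16 KB hash table,
 * while a value of 20 would use 2^20 = 1 MB. */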
/**************************************
* Simple Functions
**************************************/
int LZ4_compress_default(const char* source, char* dest, int sourceSize, int maxDestSize);
int LZ4_decompress_safe (const char* source, char* dest, int compressedSize, int maxDecompressedSize);
/*
LZ4_compress_default() :
Compresses 'sourceSize' bytes from buffer 'source'
into already allocated 'dest' buffer of size 'maxDestSize'.
Compression is guaranteed to succeed if 'maxDestSize' >= LZ4_compressBound(sourceSize).
It also runs faster, so it's a recommended setting.
If the function cannot compress 'source' into a more limited 'dest' budget,
compression stops *immediately*, and the function result is zero.
As a consequence, 'dest' content is not valid.
This function never writes outside 'dest' buffer, nor reads outside 'source' buffer.
sourceSize : Max supported value is LZ4_MAX_INPUT_SIZE
maxDestSize : full or partial size of buffer 'dest' (which must be already allocated)
return : the number of bytes written into buffer 'dest' (necessarily <= maxDestSize)
or 0 if compression fails
LZ4_decompress_safe() :
compressedSize : is the precise full size of the compressed block.
maxDecompressedSize : is the size of destination buffer, which must be already allocated.
return : the number of bytes decompressed into destination buffer (necessarily <= maxDecompressedSize)
If destination buffer is not large enough, decoding will stop and output an error code (<0).
If the source stream is detected malformed, the function will stop decoding and return a negative result.
This function is protected against buffer overflow exploits, including malicious data packets.
It never writes outside output buffer, nor reads outside input buffer.
*/
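/*
Usage sketch (illustration only, not part of the library) :
'src', 'srcSize' and error handling are assumed to be provided by the caller.
    int const bound = LZ4_compressBound(srcSize);
    char* const cBuf = (char*) malloc(bound);
    char* const rBuf = (char*) malloc(srcSize);
    int const cSize = LZ4_compress_default(src, cBuf, srcSize, bound);
    // cSize == 0 means compression failed
    int const dSize = LZ4_decompress_safe(cBuf, rBuf, cSize, srcSize);
    // dSize < 0 means malformed input ; on success, dSize == srcSize
*/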
/**************************************
* Advanced Functions
**************************************/
#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */
#define LZ4_COMPRESSBOUND(isize) ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16)
/*
LZ4_compressBound() :
Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible)
This function is primarily useful for memory allocation purposes (destination buffer size).
Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example).
Note that LZ4_compress_default() compresses faster when dest buffer size is >= LZ4_compressBound(srcSize)
inputSize : max supported value is LZ4_MAX_INPUT_SIZE
return : maximum output size in a "worst case" scenario
or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE)
*/
int LZ4_compressBound(int inputSize);
/*
LZ4_compress_fast() :
Same as LZ4_compress_default(), but allows selection of an "acceleration" factor.
The larger the acceleration value, the faster the algorithm, but also the lower the compression ratio.
It's a trade-off. It can be fine-tuned, with each successive value providing roughly +~3% to speed.
An acceleration value of "1" is the same as regular LZ4_compress_default()
Values <= 0 will be replaced by ACCELERATION_DEFAULT (see lz4.c), which is 1.
*/
int LZ4_compress_fast (const char* source, char* dest, int sourceSize, int maxDestSize, int acceleration);
/*
LZ4_compress_fast_extState() :
Same compression function, just using an externally allocated memory space to store compression state.
Use LZ4_sizeofState() to know how much memory must be allocated,
and allocate it on 8-byte boundaries (using malloc() typically).
Then, provide it as 'void* state' to compression function.
*/
int LZ4_sizeofState(void);
int LZ4_compress_fast_extState (void* state, const char* source, char* dest, int inputSize, int maxDestSize, int acceleration);
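/*
Sketch (illustration only) : driving the extState variant with a manually
allocated state. 'src', 'dst', 'srcSize' and 'dstCapacity' are assumed
caller-provided.
    void* const state = malloc(LZ4_sizeofState());   // malloc() returns suitably aligned memory
    int const cSize = LZ4_compress_fast_extState(state, src, dst, srcSize, dstCapacity, 1);
    free(state);
*/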
/*
LZ4_compress_destSize() :
Reverse the logic, by compressing as much data as possible from 'source' buffer
into already allocated buffer 'dest' of size 'targetDestSize'.
This function either compresses the entire 'source' content into 'dest' if it's large enough,
or fills 'dest' buffer completely with as much data as possible from 'source'.
*sourceSizePtr : will be modified to indicate how many bytes were read from 'source' to fill 'dest'.
New value is necessarily <= old value.
return : Nb bytes written into 'dest' (necessarily <= targetDestSize)
or 0 if compression fails
*/
int LZ4_compress_destSize (const char* source, char* dest, int* sourceSizePtr, int targetDestSize);
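/*
Sketch (illustration only) : filling a fixed-size 4 KB packet.
'src' and 'srcSize' are assumed caller-provided.
    char packet[4096];
    int consumed = srcSize;   // in : bytes available ; out : bytes actually read
    int const written = LZ4_compress_destSize(src, packet, &consumed, (int)sizeof(packet));
    // 'written' <= 4096 ; the first 'consumed' bytes of 'src' are encoded in 'packet'
*/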
/*
LZ4_decompress_fast() :
originalSize : is the original and therefore uncompressed size
return : the number of bytes read from the source buffer (in other words, the compressed size)
If the source stream is detected malformed, the function will stop decoding and return a negative result.
Destination buffer must be already allocated. Its size must be a minimum of 'originalSize' bytes.
note : This function fully respects memory boundaries for properly formed compressed data.
It is a bit faster than LZ4_decompress_safe().
However, it does not provide any protection against intentionally modified data stream (malicious input).
Use this function in trusted environment only (data to decode comes from a trusted source).
*/
int LZ4_decompress_fast (const char* source, char* dest, int originalSize);
/*
LZ4_decompress_safe_partial() :
This function decompresses a compressed block of size 'compressedSize' at position 'source'
into destination buffer 'dest' of size 'maxDecompressedSize'.
The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached,
reducing decompression time.
return : the number of bytes decoded in the destination buffer (necessarily <= maxDecompressedSize)
Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller.
Always control how many bytes were decoded.
If the source stream is detected malformed, the function will stop decoding and return a negative result.
This function never writes outside of the output buffer, and never reads outside of the input buffer. It is therefore protected against malicious data packets.
*/
int LZ4_decompress_safe_partial (const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize);
/***********************************************
* Streaming Compression Functions
***********************************************/
#define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE-3)) + 4)
#define LZ4_STREAMSIZE (LZ4_STREAMSIZE_U64 * sizeof(long long))
/*
* LZ4_stream_t
* information structure to track an LZ4 stream.
* important : init this structure content before first use !
* note : only allocate the structure directly if you are statically linking LZ4.
* If you are using liblz4 as a DLL, please use below construction methods instead.
*/
typedef struct { long long table[LZ4_STREAMSIZE_U64]; } LZ4_stream_t;
/*
* LZ4_resetStream
* Use this function to init an allocated LZ4_stream_t structure
*/
void LZ4_resetStream (LZ4_stream_t* streamPtr);
/*
* LZ4_createStream will allocate and initialize an LZ4_stream_t structure
* LZ4_freeStream releases its memory.
* In the context of a DLL (liblz4), please use these methods rather than the static struct.
* They are more future proof, in case of a change of LZ4_stream_t size.
*/
LZ4_stream_t* LZ4_createStream(void);
int LZ4_freeStream (LZ4_stream_t* streamPtr);
/*
* LZ4_loadDict
* Use this function to load a static dictionary into LZ4_stream.
* Any previous data will be forgotten, only 'dictionary' will remain in memory.
* Loading a size of 0 is allowed.
* Return : dictionary size, in bytes (necessarily <= 64 KB)
*/
int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize);
/*
* LZ4_compress_fast_continue
* Compress buffer content 'src', using data from previously compressed blocks as dictionary to improve compression ratio.
* Important : Previous data blocks are assumed to still be present and unmodified !
* 'dst' buffer must be already allocated.
* If maxDstSize >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster.
* If not, and if the compressed data cannot fit into the 'dst' buffer, compression stops and the function returns zero.
*/
int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int maxDstSize, int acceleration);
/*
* LZ4_saveDict
* If the previously compressed data block is not guaranteed to remain available at its memory location,
* save it into a safer place (char* safeBuffer).
* Note : you don't need to call LZ4_loadDict() afterwards,
* dictionary is immediately usable, you can therefore call LZ4_compress_fast_continue()
* Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error
*/
int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int dictSize);
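/*
Sketch (illustration only) of a double-buffered streaming loop.
read_block() and send_block() are hypothetical I/O helpers.
    LZ4_stream_t* const stream = LZ4_createStream();
    static char inBuf[2][64 * 1024];
    static char outBuf[LZ4_COMPRESSBOUND(64 * 1024)];
    int idx = 0, n;
    while ((n = read_block(inBuf[idx], sizeof(inBuf[idx]))) > 0) {
        int const cSize = LZ4_compress_fast_continue(stream, inBuf[idx], outBuf, n, (int)sizeof(outBuf), 1);
        send_block(outBuf, cSize);
        idx ^= 1;   // the previous buffer stays untouched and serves as dictionary
    }
    LZ4_freeStream(stream);
*/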
/************************************************
* Streaming Decompression Functions
************************************************/
#define LZ4_STREAMDECODESIZE_U64 4
#define LZ4_STREAMDECODESIZE (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long))
typedef struct { unsigned long long table[LZ4_STREAMDECODESIZE_U64]; } LZ4_streamDecode_t;
/*
* LZ4_streamDecode_t
* information structure to track an LZ4 stream.
* init this structure content using LZ4_setStreamDecode or memset() before first use !
*
* In the context of a DLL (liblz4) please prefer usage of construction methods below.
* They are more future proof, in case of a change of LZ4_streamDecode_t size in the future.
* LZ4_createStreamDecode will allocate and initialize an LZ4_streamDecode_t structure
* LZ4_freeStreamDecode releases its memory.
*/
LZ4_streamDecode_t* LZ4_createStreamDecode(void);
int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream);
/*
* LZ4_setStreamDecode
* Use this function to instruct where to find the dictionary.
* Setting a size of 0 is allowed (same effect as reset).
* Return : 1 if OK, 0 if error
*/
int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize);
/*
*_continue() :
These decoding functions allow decompression of multiple blocks in "streaming" mode.
Previously decoded blocks *must* remain available at the memory position where they were decoded (up to 64 KB)
In the case of ring buffers, the decoding buffer must be either :
- Exactly same size as encoding buffer, with same update rule (block boundaries at same positions)
In which case, the decoding & encoding ring buffer can have any size, including very small ones ( < 64 KB).
- Larger than encoding buffer, by a minimum of maxBlockSize more bytes.
maxBlockSize is implementation dependent. It's the maximum size you intend to compress into a single block.
In which case, encoding and decoding buffers do not need to be synchronized,
and encoding ring buffer can have any size, including small ones ( < 64 KB).
- _At least_ 64 KB + 8 bytes + maxBlockSize.
In which case, encoding and decoding buffers do not need to be synchronized,
and encoding ring buffer can have any size, including larger than decoding buffer.
Whenever these conditions are not possible, save the last 64KB of decoded data into a safe buffer,
and indicate where it is saved using LZ4_setStreamDecode()
*/
int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxDecompressedSize);
int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize);
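/*
Sketch (illustration only) of the matching decode loop ; recv_block() and
'cmpBuf' are hypothetical, and the double-buffering mirrors the encoder side.
    LZ4_streamDecode_t* const sd = LZ4_createStreamDecode();
    static char decBuf[2][64 * 1024];
    int idx = 0, cSize;
    while ((cSize = recv_block(cmpBuf, sizeof(cmpBuf))) > 0) {
        int const dSize = LZ4_decompress_safe_continue(sd, cmpBuf, decBuf[idx], cSize, (int)sizeof(decBuf[idx]));
        if (dSize < 0) break;   // malformed input
        idx ^= 1;   // keep the previous block resident for back-references
    }
    LZ4_freeStreamDecode(sd);
*/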
/*
Advanced decoding functions :
*_usingDict() :
These decoding functions work the same as
a combination of LZ4_setStreamDecode() followed by LZ4_decompress_x_continue()
They are stand-alone. They don't need nor update an LZ4_streamDecode_t structure.
*/
int LZ4_decompress_safe_usingDict (const char* source, char* dest, int compressedSize, int maxDecompressedSize, const char* dictStart, int dictSize);
int LZ4_decompress_fast_usingDict (const char* source, char* dest, int originalSize, const char* dictStart, int dictSize);
/**************************************
* Obsolete Functions
**************************************/
/* Deprecation warnings */
/* Should these warnings messages be a problem,
it is generally possible to disable them,
with -Wno-deprecated-declarations for gcc
or _CRT_SECURE_NO_WARNINGS in Visual for example.
You can also define LZ4_DEPRECATE_WARNING_DEFBLOCK. */
#ifndef LZ4_DEPRECATE_WARNING_DEFBLOCK
# define LZ4_DEPRECATE_WARNING_DEFBLOCK
# define LZ4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
# if (LZ4_GCC_VERSION >= 405) || defined(__clang__)
# define LZ4_DEPRECATED(message) __attribute__((deprecated(message)))
# elif (LZ4_GCC_VERSION >= 301)
# define LZ4_DEPRECATED(message) __attribute__((deprecated))
# elif defined(_MSC_VER)
# define LZ4_DEPRECATED(message) __declspec(deprecated(message))
# else
# pragma message("WARNING: You need to implement LZ4_DEPRECATED for this compiler")
# define LZ4_DEPRECATED(message)
# endif
#endif /* LZ4_DEPRECATE_WARNING_DEFBLOCK */
/* Obsolete compression functions */
/* These functions are planned to start generating warnings by r131 approximately */
int LZ4_compress (const char* source, char* dest, int sourceSize);
int LZ4_compress_limitedOutput (const char* source, char* dest, int sourceSize, int maxOutputSize);
int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize);
int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize);
int LZ4_compress_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize);
int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize);
/* Obsolete decompression functions */
/* These function names are completely deprecated and must no longer be used.
They are only provided here for compatibility with older programs.
- LZ4_uncompress is the same as LZ4_decompress_fast
- LZ4_uncompress_unknownOutputSize is the same as LZ4_decompress_safe
These function prototypes are now disabled; uncomment them only if you really need them.
It is highly recommended to stop using these prototypes and migrate to maintained ones */
/* int LZ4_uncompress (const char* source, char* dest, int outputSize); */
/* int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); */
/* Obsolete streaming functions; use new streaming interface whenever possible */
LZ4_DEPRECATED("use LZ4_createStream() instead") void* LZ4_create (char* inputBuffer);
LZ4_DEPRECATED("use LZ4_createStream() instead") int LZ4_sizeofStreamState(void);
LZ4_DEPRECATED("use LZ4_resetStream() instead") int LZ4_resetStreamState(void* state, char* inputBuffer);
LZ4_DEPRECATED("use LZ4_saveDict() instead") char* LZ4_slideInputBuffer (void* state);
/* Obsolete streaming decoding functions */
LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize);
LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize);
#if defined (__cplusplus)
}
#endif
==== bitshuffle-0.3.5/lzf/LICENSE.txt ====
Copyright Notice and Statement for LZF filter
Copyright (c) 2008-2009 Andrew Collette
http://h5py.alfven.org
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
a. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
b. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the
distribution.
c. Neither the name of the author nor the names of contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
==== bitshuffle-0.3.5/lzf/README.txt ====
===============================
LZF filter for HDF5, revision 3
===============================
The LZF filter provides high-speed compression with acceptable compression
performance, resulting in much faster performance than DEFLATE, at the
cost of a slightly lower compression ratio. It's appropriate for large
datasets of low to moderate complexity, for which some compression is
much better than none, but for which the speed of DEFLATE is unacceptable.
This filter has been tested against HDF5 versions 1.6.5 through 1.8.3. It
is released under the BSD license (see LICENSE.txt for details).
Using the filter from HDF5
--------------------------
There is exactly one new public function declared in lzf_filter.h, with
the following signature:
int register_lzf(void)
Calling this will register the filter with the HDF5 library. A non-negative
return value indicates success. If the registration fails, an error is pushed
onto the current error stack and a negative value is returned.
It's strongly recommended to use the SHUFFLE filter with LZF, as it's
cheap, supported by all current versions of HDF5, and can significantly
improve the compression ratio. An example C program ("example.c") is included
which demonstrates the proper use of the filter.
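For reference, a minimal sketch of the registration and property-list
setup (error handling elided; "plist" is assumed to come from
H5Pcreate(H5P_DATASET_CREATE), and H5PY_FILTER_LZF is the filter ID
declared in lzf_filter.h):
    if(register_lzf() < 0) { /* handle the error */ }
    H5Pset_shuffle(plist);
    H5Pset_filter(plist, H5PY_FILTER_LZF, H5Z_FLAG_OPTIONAL, 0, NULL);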
Compiling
---------
The filter consists of a single .c file and header, along with an embedded
version of the LZF compression library. Since the filter is stateless, it's
recommended to statically link the entire thing into your program; for
example:
$ gcc -O2 -lhdf5 lzf/*.c lzf_filter.c myprog.c -o myprog
It can also be built as a shared library, although you will have to install
the resulting library somewhere the runtime linker can find it:
$ gcc -O2 -lhdf5 -fPIC -shared lzf/*.c lzf_filter.c -o liblzf_filter.so
A similar procedure should be used for building C++ code. As in these
examples, using option -O1 or higher is strongly recommended for increased
performance.
Contact
-------
This filter is maintained as part of the HDF5 for Python (h5py) project. The
goal of h5py is to provide access to the majority of the HDF5 C API and feature
set from Python. The most recent version of h5py (1.1) includes the LZF
filter by default.
* Downloads and bug tracker: http://h5py.googlecode.com
* Main web site and documentation: http://h5py.alfven.org
* Contact email: h5py at alfven dot org
History of changes
------------------
Revision 3 (6/25/09)
Fix issue with changed filter struct definition under HDF5 1.8.3.
Revision 2
Minor speed enhancement.
Revision 1
Initial release.
==== bitshuffle-0.3.5/lzf/README_bitshuffle.txt ====
The LZF filter for HDF5 is part of the h5py project (http://h5py.alfven.org).
The version included with bitshuffle is from version 2.3 of h5py with no
modifications other than the addition of this README.
/* ==== bitshuffle-0.3.5/lzf/example.c ==== */
/*
Copyright (C) 2009 Andrew Collette
http://h5py.alfven.org
License: BSD (see LICENSE.txt)
Example program demonstrating use of the LZF filter from C code.
To compile this program:
h5cc -DH5_USE_16_API lzf/*.c lzf_filter.c example.c -o example
To run:
$ ./example
Success!
$ h5ls -v test_lzf.hdf5
Opened "test_lzf.hdf5" with sec2 driver.
dset Dataset {100/100, 100/100, 100/100}
Location: 0:1:0:976
Links: 1
Modified: 2009-02-15 16:35:11 PST
Chunks: {1, 100, 100} 40000 bytes
Storage: 4000000 logical bytes, 174288 allocated bytes, 2295.05% utilization
Filter-0: shuffle-2 OPT {4}
Filter-1: lzf-32000 OPT {1, 261, 40000}
Type: native float
*/
#include <stdio.h>
#include "hdf5.h"
#include "lzf_filter.h"
#define SIZE 100*100*100
#define SHAPE {100,100,100}
#define CHUNKSHAPE {1,100,100}
int main(){
static float data[SIZE];
static float data_out[SIZE];
const hsize_t shape[] = SHAPE;
const hsize_t chunkshape[] = CHUNKSHAPE;
int r, i;
int return_code = 1;
hid_t fid, sid, dset, plist = 0;
    for(i=0; i<SIZE; i++){
        data[i] = i;
    }
    /* Register the filter with the library */
    r = register_lzf();
    if(r<0) goto failed;
    sid = H5Screate_simple(3, shape, NULL);
    if(sid<0) goto failed;
    fid = H5Fcreate("test_lzf.hdf5", H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
    if(fid<0) goto failed;
    plist = H5Pcreate(H5P_DATASET_CREATE);
    if(plist<0) goto failed;
    /* Chunked storage is required for filters */
    r = H5Pset_chunk(plist, 3, chunkshape);
    if(r<0) goto failed;
    /* Using the SHUFFLE filter before LZF is cheap and improves the ratio */
    r = H5Pset_shuffle(plist);
    if(r<0) goto failed;
    r = H5Pset_filter(plist, H5PY_FILTER_LZF, H5Z_FLAG_OPTIONAL, 0, NULL);
    if(r<0) goto failed;
    dset = H5Dcreate(fid, "dset", H5T_NATIVE_FLOAT, sid, plist);
    if(dset<0) goto failed;
    r = H5Dwrite(dset, H5T_NATIVE_FLOAT, H5S_ALL, H5S_ALL, H5P_DEFAULT, data);
    if(r<0) goto failed;
    r = H5Dread(dset, H5T_NATIVE_FLOAT, H5S_ALL, H5S_ALL, H5P_DEFAULT, data_out);
    if(r<0) goto failed;
    for(i=0; i<SIZE; i++){
        if(data[i] != data_out[i]) goto failed;
    }
    fprintf(stdout, "Success!\n");
    return_code = 0;
failed:
    if(dset>0) H5Dclose(dset);
if(sid>0) H5Sclose(sid);
if(plist>0) H5Pclose(plist);
if(fid>0) H5Fclose(fid);
return return_code;
}
/* ==== bitshuffle-0.3.5/lzf/lzf/lzf.h ==== */
/*
* Copyright (c) 2000-2008 Marc Alexander Lehmann
*
* Redistribution and use in source and binary forms, with or without modifica-
* tion, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
* CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
* EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
* CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
* ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*
* Alternatively, the contents of this file may be used under the terms of
* the GNU General Public License ("GPL") version 2 or any later version,
* in which case the provisions of the GPL are applicable instead of
* the above. If you wish to allow the use of your version of this file
* only under the terms of the GPL and not to allow others to use your
* version of this file under the BSD license, indicate your decision
* by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL. If you do not delete the
* provisions above, a recipient may use your version of this file under
* either the BSD or the GPL.
*/
#ifndef LZF_H
#define LZF_H
/***********************************************************************
**
** lzf -- an extremely fast/free compression/decompression-method
** http://liblzf.plan9.de/
**
** This algorithm is believed to be patent-free.
**
***********************************************************************/
#define LZF_VERSION 0x0105 /* 1.5, API version */
/*
* Compress in_len bytes stored at the memory block starting at
* in_data and write the result to out_data, up to a maximum length
* of out_len bytes.
*
* If the output buffer is not large enough or any error occurs return 0,
* otherwise return the number of bytes used, which might be considerably
* more than in_len (but less than 104% of the original size), so it
 * makes sense to always use out_len == in_len - 1, to ensure _some_
 * compression, and store the data uncompressed otherwise (with a flag, of
 * course).
*
* lzf_compress might use different algorithms on different systems and
* even different runs, thus might result in different compressed strings
* depending on the phase of the moon or similar factors. However, all
* these strings are architecture-independent and will result in the
* original data when decompressed using lzf_decompress.
*
* The buffers must not be overlapping.
*
* If the option LZF_STATE_ARG is enabled, an extra argument must be
* supplied which is not reflected in this header file. Refer to lzfP.h
* and lzf_c.c.
*
*/
unsigned int
lzf_compress (const void *const in_data, unsigned int in_len,
void *out_data, unsigned int out_len);
/*
* Decompress data compressed with some version of the lzf_compress
* function and stored at location in_data and length in_len. The result
* will be stored at out_data up to a maximum of out_len characters.
*
* If the output buffer is not large enough to hold the decompressed
* data, a 0 is returned and errno is set to E2BIG. Otherwise the number
* of decompressed bytes (i.e. the original length of the data) is
* returned.
*
* If an error in the compressed data is detected, a zero is returned and
* errno is set to EINVAL.
*
* This function is very fast, about as fast as a copying loop.
*/
unsigned int
lzf_decompress (const void *const in_data, unsigned int in_len,
void *out_data, unsigned int out_len);
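/*
 * Round-trip sketch (illustration only) following the out_len == in_len - 1
 * convention described above ; 'buf', 'len', 'out' and the store_*() /
 * handle_error() helpers are assumed.
 *
 *   unsigned int clen = lzf_compress (buf, len, out, len - 1);
 *   if (clen == 0)
 *     store_raw (buf, len);            // no gain : store verbatim, flag "raw"
 *   else
 *     store_compressed (out, clen);    // flag "compressed"
 *
 *   // later, to decode a "compressed" record of clen bytes :
 *   unsigned int olen = lzf_decompress (out, clen, buf, len);
 *   if (olen == 0)
 *     handle_error ();                 // errno is E2BIG or EINVAL
 */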
#endif
/* ==== bitshuffle-0.3.5/lzf/lzf/lzfP.h ==== */
/*
* Copyright (c) 2000-2007 Marc Alexander Lehmann
*
* Redistribution and use in source and binary forms, with or without modifica-
* tion, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
* CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
* EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
* CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
* ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*
* Alternatively, the contents of this file may be used under the terms of
* the GNU General Public License ("GPL") version 2 or any later version,
* in which case the provisions of the GPL are applicable instead of
* the above. If you wish to allow the use of your version of this file
* only under the terms of the GPL and not to allow others to use your
* version of this file under the BSD license, indicate your decision
* by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL. If you do not delete the
* provisions above, a recipient may use your version of this file under
* either the BSD or the GPL.
*/
#ifndef LZFP_h
#define LZFP_h
#define STANDALONE 1 /* at the moment, this is ok. */
#ifndef STANDALONE
# include "lzf.h"
#endif
/*
* Size of hashtable is (1 << HLOG) * sizeof (char *)
* decompression is independent of the hash table size
* the difference between 15 and 14 is very small
* for small blocks (and 14 is usually a bit faster).
* For a low-memory/faster configuration, use HLOG == 13;
* For best compression, use 15 or 16 (or more, up to 23).
*/
#ifndef HLOG
# define HLOG 17 /* Avoid pathological case at HLOG=16 A.C. 2/15/09 */
#endif
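/*
 * For example : HLOG == 17 means (1 << 17) hash slots, i.e. 1 MB of pointers
 * on a 64-bit system (512 KB on 32-bit) ; HLOG == 13 would need 64 KB / 32 KB.
 */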
/*
* Sacrifice very little compression quality in favour of compression speed.
* This gives almost the same compression as the default code, and is
* (very roughly) 15% faster. This is the preferred mode of operation.
*/
#ifndef VERY_FAST
# define VERY_FAST 1
#endif
/*
* Sacrifice some more compression quality in favour of compression speed.
* (roughly 1-2% worse compression for large blocks and
* 9-10% for small, redundant, blocks and >>20% better speed in both cases)
* In short: when in need for speed, enable this for binary data,
* possibly disable this for text data.
*/
#ifndef ULTRA_FAST
# define ULTRA_FAST 1
#endif
/*
* Unconditionally aligning does not cost very much, so do it if unsure
*/
#ifndef STRICT_ALIGN
# define STRICT_ALIGN !(defined(__i386) || defined (__amd64))
#endif
/*
* You may choose to pre-set the hash table (might be faster on some
* modern cpus and large (>>64k) blocks, and also makes compression
* deterministic/repeatable when the configuration otherwise is the same).
*/
#ifndef INIT_HTAB
# define INIT_HTAB 0
#endif
/* =======================================================================
Changing things below this line may break the HDF5 LZF filter.
A.C. 2/15/09
=======================================================================
*/
/*
* Avoid assigning values to errno variable? for some embedding purposes
 * (linux kernel for example), this is necessary. NOTE: this breaks
* the documentation in lzf.h.
*/
#ifndef AVOID_ERRNO
# define AVOID_ERRNO 0
#endif
/*
 * Whether to pass the LZF_STATE variable as argument, or allocate it
* on the stack. For small-stack environments, define this to 1.
* NOTE: this breaks the prototype in lzf.h.
*/
#ifndef LZF_STATE_ARG
# define LZF_STATE_ARG 0
#endif
/*
 * Whether to add extra checks for input validity in lzf_decompress
* and return EINVAL if the input stream has been corrupted. This
* only shields against overflowing the input buffer and will not
* detect most corrupted streams.
 * This check is not normally noticeable on modern hardware
* (<1% slowdown), but might slow down older cpus considerably.
*/
#ifndef CHECK_INPUT
# define CHECK_INPUT 1
#endif
/*****************************************************************************/
/* nothing should be changed below */
typedef unsigned char u8;
typedef const u8 *LZF_STATE[1 << (HLOG)];
#if !STRICT_ALIGN
/* for unaligned accesses we need a 16 bit datatype. */
# include <limits.h>
# if USHRT_MAX == 65535
typedef unsigned short u16;
# elif UINT_MAX == 65535
typedef unsigned int u16;
# else
# undef STRICT_ALIGN
# define STRICT_ALIGN 1
# endif
#endif
#if ULTRA_FAST
# if defined(VERY_FAST)
# undef VERY_FAST
# endif
#endif
#if INIT_HTAB
# ifdef __cplusplus
#  include <cstring>
# else
#  include <string.h>
# endif
#endif
#endif
/* ==== bitshuffle-0.3.5/lzf/lzf/lzf_c.c ==== */
/*
* Copyright (c) 2000-2008 Marc Alexander Lehmann
*
* Redistribution and use in source and binary forms, with or without modifica-
* tion, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
* CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
* EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
* CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
* ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*
* Alternatively, the contents of this file may be used under the terms of
* the GNU General Public License ("GPL") version 2 or any later version,
* in which case the provisions of the GPL are applicable instead of
* the above. If you wish to allow the use of your version of this file
* only under the terms of the GPL and not to allow others to use your
* version of this file under the BSD license, indicate your decision
* by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL. If you do not delete the
* provisions above, a recipient may use your version of this file under
* either the BSD or the GPL.
*/
#include "lzfP.h"
#define HSIZE (1 << (HLOG))
/*
* don't play with this unless you benchmark!
* decompression is not dependent on the hash function
* the hashing function might seem strange, just believe me
* it works ;)
*/
#ifndef FRST
# define FRST(p) (((p[0]) << 8) | p[1])
# define NEXT(v,p) (((v) << 8) | p[2])
# if ULTRA_FAST
# define IDX(h) ((( h >> (3*8 - HLOG)) - h ) & (HSIZE - 1))
# elif VERY_FAST
# define IDX(h) ((( h >> (3*8 - HLOG)) - h*5) & (HSIZE - 1))
# else
# define IDX(h) ((((h ^ (h << 5)) >> (3*8 - HLOG)) - h*5) & (HSIZE - 1))
# endif
#endif
/*
* IDX works because it is very similar to a multiplicative hash, e.g.
* ((h * 57321 >> (3*8 - HLOG)) & (HSIZE - 1))
* the latter is also quite fast on newer CPUs, and compresses similarly.
*
* the next one is also quite good, albeit slow ;)
* (int)(cos(h & 0xffffff) * 1e6)
*/
#if 0
/* original lzv-like hash function, much worse and thus slower */
# define FRST(p) (p[0] << 5) ^ p[1]
# define NEXT(v,p) ((v) << 5) ^ p[2]
# define IDX(h) ((h) & (HSIZE - 1))
#endif
#define MAX_LIT (1 << 5)
#define MAX_OFF (1 << 13)
#define MAX_REF ((1 << 8) + (1 << 3))
#if __GNUC__ >= 3
# define expect(expr,value) __builtin_expect ((expr),(value))
# define inline inline
#else
# define expect(expr,value) (expr)
# define inline static
#endif
#define expect_false(expr) expect ((expr) != 0, 0)
#define expect_true(expr) expect ((expr) != 0, 1)
/*
* compressed format
*
* 000LLLLL ; literal
* LLLooooo oooooooo ; backref L
* 111ooooo LLLLLLLL oooooooo ; backref L+7
*
*/
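/*
 * worked example (illustration) : a 9-octet match at distance 300 stores
 * len = 9 - 2 = 7 and off = 300 - 1 = 299 (0x12B). since len >= 7, the long
 * form is emitted : byte0 = (7 << 5) | (299 >> 8) = 0xe1, byte1 = len - 7 = 0x00,
 * byte2 = 299 & 0xff = 0x2b. a literal run of N octets (1..32) is preceded
 * by a control byte 000LLLLL with L = N - 1.
 */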
unsigned int
lzf_compress (const void *const in_data, unsigned int in_len,
void *out_data, unsigned int out_len
#if LZF_STATE_ARG
, LZF_STATE htab
#endif
)
{
#if !LZF_STATE_ARG
LZF_STATE htab;
#endif
const u8 **hslot;
const u8 *ip = (const u8 *)in_data;
u8 *op = (u8 *)out_data;
const u8 *in_end = ip + in_len;
u8 *out_end = op + out_len;
const u8 *ref;
/* off requires a type wide enough to hold a general pointer difference.
* ISO C doesn't have that (size_t might not be enough and ptrdiff_t only
* works for differences within a single object). We also assume that no
* no bit pattern traps. Since the only platform that is both non-POSIX
* and fails to support both assumptions is windows 64 bit, we make a
* special workaround for it.
*/
#if ( defined (WIN32) && defined (_M_X64) ) || defined (_WIN64)
unsigned _int64 off; /* workaround for missing POSIX compliance */
#else
unsigned long off;
#endif
unsigned int hval;
int lit;
if (!in_len || !out_len)
return 0;
#if INIT_HTAB
memset (htab, 0, sizeof (htab));
# if 0
for (hslot = htab; hslot < htab + HSIZE; hslot++)
*hslot++ = ip;
# endif
#endif
lit = 0; op++; /* start run */
hval = FRST (ip);
while (ip < in_end - 2)
{
hval = NEXT (hval, ip);
hslot = htab + IDX (hval);
ref = *hslot; *hslot = ip;
if (1
#if INIT_HTAB
&& ref < ip /* the next test will actually take care of this, but this is faster */
#endif
&& (off = ip - ref - 1) < MAX_OFF
&& ip + 4 < in_end
&& ref > (u8 *)in_data
#if STRICT_ALIGN
&& ref[0] == ip[0]
&& ref[1] == ip[1]
&& ref[2] == ip[2]
#else
&& *(u16 *)ref == *(u16 *)ip
&& ref[2] == ip[2]
#endif
)
{
/* match found at *ref++ */
unsigned int len = 2;
unsigned int maxlen = in_end - ip - len;
maxlen = maxlen > MAX_REF ? MAX_REF : maxlen;
if (expect_false (op + 3 + 1 >= out_end)) /* first a faster conservative test */
if (op - !lit + 3 + 1 >= out_end) /* second the exact but rare test */
return 0;
op [- lit - 1] = lit - 1; /* stop run */
op -= !lit; /* undo run if length is zero */
for (;;)
{
if (expect_true (maxlen > 16))
{
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
len++; if (ref [len] != ip [len]) break;
}
do
len++;
while (len < maxlen && ref[len] == ip[len]);
break;
}
len -= 2; /* len is now #octets - 1 */
ip++;
if (len < 7)
{
*op++ = (off >> 8) + (len << 5);
}
else
{
*op++ = (off >> 8) + ( 7 << 5);
*op++ = len - 7;
}
*op++ = off;
lit = 0; op++; /* start run */
ip += len + 1;
if (expect_false (ip >= in_end - 2))
break;
#if ULTRA_FAST || VERY_FAST
--ip;
# if VERY_FAST && !ULTRA_FAST
--ip;
# endif
hval = FRST (ip);
hval = NEXT (hval, ip);
htab[IDX (hval)] = ip;
ip++;
# if VERY_FAST && !ULTRA_FAST
hval = NEXT (hval, ip);
htab[IDX (hval)] = ip;
ip++;
# endif
#else
ip -= len + 1;
do
{
hval = NEXT (hval, ip);
htab[IDX (hval)] = ip;
ip++;
}
while (len--);
#endif
}
else
{
/* one more literal byte we must copy */
if (expect_false (op >= out_end))
return 0;
lit++; *op++ = *ip++;
if (expect_false (lit == MAX_LIT))
{
op [- lit - 1] = lit - 1; /* stop run */
lit = 0; op++; /* start run */
}
}
}
if (op + 3 > out_end) /* at most 3 bytes can be missing here */
return 0;
while (ip < in_end)
{
lit++; *op++ = *ip++;
if (expect_false (lit == MAX_LIT))
{
op [- lit - 1] = lit - 1; /* stop run */
lit = 0; op++; /* start run */
}
}
op [- lit - 1] = lit - 1; /* end run */
op -= !lit; /* undo run if length is zero */
return op - (u8 *)out_data;
}
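/*
 * Usage sketch (illustrative, assuming LZF_STATE_ARG is 0 so no hash-table
 * argument is passed): compress into a buffer no larger than the input; a
 * return value of 0 means the data did not shrink and the caller should
 * store it uncompressed instead. store_uncompressed is a hypothetical
 * fallback, not part of liblzf.
 */
#if 0
unsigned int n = lzf_compress (src, src_len, dst, src_len);
if (!n)
  store_uncompressed (src, src_len); /* hypothetical fallback */
#endif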
bitshuffle-0.3.5/lzf/lzf/lzf_d.c
/*
* Copyright (c) 2000-2007 Marc Alexander Lehmann
*
* Redistribution and use in source and binary forms, with or without modifica-
* tion, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
* CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
* EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
* CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
* ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
*
* Alternatively, the contents of this file may be used under the terms of
* the GNU General Public License ("GPL") version 2 or any later version,
* in which case the provisions of the GPL are applicable instead of
* the above. If you wish to allow the use of your version of this file
* only under the terms of the GPL and not to allow others to use your
* version of this file under the BSD license, indicate your decision
* by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL. If you do not delete the
* provisions above, a recipient may use your version of this file under
* either the BSD or the GPL.
*/
#include "lzfP.h"
#if AVOID_ERRNO
# define SET_ERRNO(n)
#else
# include <errno.h>
# define SET_ERRNO(n) errno = (n)
#endif
/* ASM is slower than C in HDF5 tests -- A.C. 2/5/09
#ifndef __STRICT_ANSI__
#ifndef H5PY_DISABLE_LZF_ASM
#if (__i386 || __amd64) && __GNUC__ >= 3
# define lzf_movsb(dst, src, len) \
asm ("rep movsb" \
: "=D" (dst), "=S" (src), "=c" (len) \
: "0" (dst), "1" (src), "2" (len));
#endif
#endif
#endif
*/
unsigned int
lzf_decompress (const void *const in_data, unsigned int in_len,
void *out_data, unsigned int out_len)
{
u8 const *ip = (const u8 *)in_data;
u8 *op = (u8 *)out_data;
u8 const *const in_end = ip + in_len;
u8 *const out_end = op + out_len;
do
{
unsigned int ctrl = *ip++;
if (ctrl < (1 << 5)) /* literal run */
{
ctrl++;
if (op + ctrl > out_end)
{
SET_ERRNO (E2BIG);
return 0;
}
#if CHECK_INPUT
if (ip + ctrl > in_end)
{
SET_ERRNO (EINVAL);
return 0;
}
#endif
#ifdef lzf_movsb
lzf_movsb (op, ip, ctrl);
#else
do
*op++ = *ip++;
while (--ctrl);
#endif
}
else /* back reference */
{
unsigned int len = ctrl >> 5;
u8 *ref = op - ((ctrl & 0x1f) << 8) - 1;
#if CHECK_INPUT
if (ip >= in_end)
{
SET_ERRNO (EINVAL);
return 0;
}
#endif
if (len == 7)
{
len += *ip++;
#if CHECK_INPUT
if (ip >= in_end)
{
SET_ERRNO (EINVAL);
return 0;
}
#endif
}
ref -= *ip++;
if (op + len + 2 > out_end)
{
SET_ERRNO (E2BIG);
return 0;
}
if (ref < (u8 *)out_data)
{
SET_ERRNO (EINVAL);
return 0;
}
#ifdef lzf_movsb
len += 2;
lzf_movsb (op, ref, len);
#else
*op++ = *ref++;
*op++ = *ref++;
do
*op++ = *ref++;
while (--len);
#endif
}
}
while (ip < in_end);
return op - (u8 *)out_data;
}
bitshuffle-0.3.5/lzf/lzf_filter.c
/***** Preamble block *********************************************************
*
* This file is part of h5py, a low-level Python interface to the HDF5 library.
*
* Copyright (C) 2008 Andrew Collette
* http://h5py.alfven.org
* License: BSD (See LICENSE.txt for full license)
*
* $Date$
*
****** End preamble block ****************************************************/
/*
Implements an LZF filter module for HDF5, using the BSD-licensed library
by Marc Alexander Lehmann (http://www.goof.com/pcg/marc/liblzf.html).
No Python-specific code is used. The filter behaves like the DEFLATE
filter, in that it is called for every type and space, and returns 0
if the data cannot be compressed.
The only public function is (int) register_lzf(void), which passes on
the result from H5Zregister.
*/
*/
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include "hdf5.h"
#include "lzf/lzf.h"
#include "lzf_filter.h"
/* Our own versions of H5Epush_sim, as it changed in 1.8 */
#if H5_VERS_MAJOR == 1 && H5_VERS_MINOR < 7
#define PUSH_ERR(func, minor, str) H5Epush(__FILE__, func, __LINE__, H5E_PLINE, minor, str)
#define H5PY_GET_FILTER H5Pget_filter_by_id
#else
#define PUSH_ERR(func, minor, str) H5Epush1(__FILE__, func, __LINE__, H5E_PLINE, minor, str)
#define H5PY_GET_FILTER(a,b,c,d,e,f,g) H5Pget_filter_by_id2(a,b,c,d,e,f,g,NULL)
#endif
/* Deal with the multiple definitions for H5Z_class_t.
Note: Only HDF5 1.6 and 1.8 are supported.
(1) The old class should always be used for HDF5 1.6
(2) The new class should always be used for HDF5 1.8 < 1.8.3
(3) The old class should be used for HDF5 1.8 >= 1.8.3 only if the
macro H5_USE_16_API is set
*/
#if H5_VERS_MAJOR == 1 && H5_VERS_MINOR == 8 && (H5_VERS_RELEASE < 3 || !H5_USE_16_API)
#define H5PY_H5Z_NEWCLS 1
#else
#define H5PY_H5Z_NEWCLS 0
#endif
size_t lzf_filter(unsigned flags, size_t cd_nelmts,
const unsigned cd_values[], size_t nbytes,
size_t *buf_size, void **buf);
herr_t lzf_set_local(hid_t dcpl, hid_t type, hid_t space);
/* Try to register the filter, passing on the HDF5 return value */
int register_lzf(void){
int retval;
#if H5PY_H5Z_NEWCLS
H5Z_class_t filter_class = {
H5Z_CLASS_T_VERS,
(H5Z_filter_t)(H5PY_FILTER_LZF),
1, 1,
"lzf",
NULL,
(H5Z_set_local_func_t)(lzf_set_local),
(H5Z_func_t)(lzf_filter)
};
#else
H5Z_class_t filter_class = {
(H5Z_filter_t)(H5PY_FILTER_LZF),
"lzf",
NULL,
(H5Z_set_local_func_t)(lzf_set_local),
(H5Z_func_t)(lzf_filter)
};
#endif
retval = H5Zregister(&filter_class);
if(retval<0){
PUSH_ERR("register_lzf", H5E_CANTREGISTER, "Can't register LZF filter");
}
return retval;
}
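/* Illustrative use (not part of this file): register the filter once per
   process, then request it on a chunked dataset creation property list.
   H5Z_FLAG_OPTIONAL gives the DEFLATE-like behaviour described above:
   chunks that fail to compress are simply stored uncompressed. */
#if 0
if(register_lzf() < 0){ /* handle error */ }
H5Pset_filter(dcpl, H5PY_FILTER_LZF, H5Z_FLAG_OPTIONAL, 0, NULL);
#endif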
/* Filter setup. Records the following inside the DCPL:
1. If version information is not present, set slots 0 and 1 to the filter
revision and LZF API version, respectively.
2. Compute the chunk size in bytes and store it in slot 2.
*/
herr_t lzf_set_local(hid_t dcpl, hid_t type, hid_t space){
int ndims;
int i;
herr_t r;
unsigned int bufsize;
hsize_t chunkdims[32];
unsigned int flags;
size_t nelements = 8;
unsigned values[] = {0,0,0,0,0,0,0,0};
r = H5PY_GET_FILTER(dcpl, H5PY_FILTER_LZF, &flags, &nelements, values, 0, NULL);
if(r<0) return -1;
if(nelements < 3) nelements = 3; /* First 3 slots reserved. If any higher
slots are used, preserve the contents. */
/* It seems the H5Z_FLAG_REVERSE flag doesn't work here, so we have to be
careful not to clobber any existing version info */
if(values[0]==0) values[0] = H5PY_FILTER_LZF_VERSION;
if(values[1]==0) values[1] = LZF_VERSION;
ndims = H5Pget_chunk(dcpl, 32, chunkdims);
if(ndims<0) return -1;
if(ndims>32){
PUSH_ERR("lzf_set_local", H5E_CALLBACK, "Chunk rank exceeds limit");
return -1;
}
bufsize = H5Tget_size(type);
if(bufsize==0) return -1;
for(i=0;i<ndims;i++){
bufsize *= chunkdims[i];
}
/* Final list of parameters: [filter_revision, lzf_version, bufsize] */
values[2] = bufsize;
r = H5Pset_filter(dcpl, H5PY_FILTER_LZF, H5Z_FLAG_OPTIONAL, nelements, values);
if(r<0) return -1;
return 1;
}
/* The filter function */
size_t lzf_filter(unsigned flags, size_t cd_nelmts,
const unsigned cd_values[], size_t nbytes,
size_t *buf_size, void **buf){
void* outbuf = NULL;
size_t outbuf_size = 0;
unsigned int status = 0; /* Return code from lzf routines */
/* We're compressing */
if(!(flags & H5Z_FLAG_REVERSE)){
/* Allocate an output buffer exactly as long as the input data; if the
result is larger, we simply return 0. The filter is flagged as optional,
so HDF5 marks the chunk as uncompressed and proceeds. */
outbuf_size = (*buf_size);
outbuf = malloc(outbuf_size);
if(outbuf == NULL){
PUSH_ERR("lzf_filter", H5E_CALLBACK, "Can't allocate compression buffer");
goto failed;
}
status = lzf_compress(*buf, nbytes, outbuf, outbuf_size);
/* We're decompressing */
} else {
if((cd_nelmts>=3)&&(cd_values[2]!=0)){
outbuf_size = cd_values[2]; /* Precomputed buffer guess */
}else{
outbuf_size = (*buf_size);
}
#ifdef H5PY_LZF_DEBUG
fprintf(stderr, "Decompress %d chunk w/buffer %d\n", nbytes, outbuf_size);
#endif
while(!status){
free(outbuf);
outbuf = malloc(outbuf_size);
if(outbuf == NULL){
PUSH_ERR("lzf_filter", H5E_CALLBACK, "Can't allocate decompression buffer");
goto failed;
}
status = lzf_decompress(*buf, nbytes, outbuf, outbuf_size);
if(!status){ /* compression failed */
if(errno == E2BIG){
outbuf_size += (*buf_size);
#ifdef H5PY_LZF_DEBUG
fprintf(stderr, " Too small: %d\n", outbuf_size);
#endif
} else if(errno == EINVAL) {
PUSH_ERR("lzf_filter", H5E_CALLBACK, "Invalid data for LZF decompression");
goto failed;
} else {
PUSH_ERR("lzf_filter", H5E_CALLBACK, "Unknown LZF decompression error");
goto failed;
}
} /* if !status */
} /* while !status */
} /* compressing vs decompressing */
if(status != 0){
free(*buf);
*buf = outbuf;
*buf_size = outbuf_size;
return status; /* Size of compressed/decompressed data */
}
failed:
free(outbuf);
return 0;
} /* End filter function */
bitshuffle-0.3.5/lzf/lzf_filter.h
/***** Preamble block *********************************************************
*
* This file is part of h5py, a low-level Python interface to the HDF5 library.
*
* Copyright (C) 2008 Andrew Collette
* http://h5py.alfven.org
* License: BSD (See LICENSE.txt for full license)
*
* $Date$
*
****** End preamble block ****************************************************/
#ifndef H5PY_LZF_H
#define H5PY_LZF_H
#ifdef __cplusplus
extern "C" {
#endif
/* Filter revision number, starting at 1 */
#define H5PY_FILTER_LZF_VERSION 4
/* Filter ID registered with the HDF Group as of 2/6/09. For maintenance
requests, contact the filter author directly. */
#define H5PY_FILTER_LZF 32000
/* Register the filter with the library. Returns a negative value on failure,
and a non-negative value on success.
*/
int register_lzf(void);
#ifdef __cplusplus
}
#endif
#endif
bitshuffle-0.3.5/requirements.txt
# Order matters
setuptools>=0.7
Cython>=0.19
numpy>=1.6.1
h5py>=2.4.0 --no-binary=h5py
bitshuffle-0.3.5/setup.cfg.example
[install]
# These control the installation of the hdf5 dynamically loaded filter plugin.
h5plugin = 0
h5plugin-dir = /usr/local/hdf5/lib/plugin
[build_ext]
# Whether to compile with OpenMP multi-threading. Default is system dependent:
# False on OSX (since the clang compiler does not yet support OpenMP) and True
# otherwise.
omp = 1
bitshuffle-0.3.5/setup.py
from __future__ import absolute_import, division, print_function
# I didn't import unicode_literals. They break setuptools or Cython in python
# 2.7, but python 3 seems to be happy with them.
import glob
import os
from os import path
from setuptools import setup, Extension
from setuptools.command.build_ext import build_ext as build_ext_
from setuptools.command.develop import develop as develop_
from setuptools.command.install import install as install_
import shutil
import subprocess
import sys
VERSION_MAJOR = 0
VERSION_MINOR = 3
VERSION_POINT = 5
# Only unset in the 'release' branch and in tags.
VERSION_DEV = 0
VERSION = "%d.%d.%d" % (VERSION_MAJOR, VERSION_MINOR, VERSION_POINT)
if VERSION_DEV:
VERSION = VERSION + ".dev%d" % VERSION_DEV
COMPILE_FLAGS = ['-O3', '-ffast-math', '-march=native', '-std=c99']
# Cython breaks strict aliasing rules.
COMPILE_FLAGS += ['-fno-strict-aliasing']
COMPILE_FLAGS += ['-fPIC']
COMPILE_FLAGS_MSVC = ['/Ox', '/fp:fast']
MACROS = [
('BSHUF_VERSION_MAJOR', VERSION_MAJOR),
('BSHUF_VERSION_MINOR', VERSION_MINOR),
('BSHUF_VERSION_POINT', VERSION_POINT),
]
H5PLUGINS_DEFAULT = '/usr/local/hdf5/lib/plugin'
# OSX's clang compiler does not support OpenMP.
if sys.platform == 'darwin':
OMP_DEFAULT = False
else:
OMP_DEFAULT = True
FALLBACK_CONFIG = {
'include_dirs': [],
'library_dirs': [],
'libraries': [],
'extra_compile_args': [],
'extra_link_args': [],
}
if 'HDF5_DIR' in os.environ:
FALLBACK_CONFIG['include_dirs'] += [os.environ['HDF5_DIR'] + '/include'] # macports
FALLBACK_CONFIG['library_dirs'] += [os.environ['HDF5_DIR'] + '/lib'] # macports
elif sys.platform == 'darwin':
# putting here both macports and homebrew paths will generate
# "ld: warning: dir not found" at the linking phase
FALLBACK_CONFIG['include_dirs'] += ['/opt/local/include'] # macports
FALLBACK_CONFIG['library_dirs'] += ['/opt/local/lib'] # macports
FALLBACK_CONFIG['include_dirs'] += ['/usr/local/include'] # homebrew
FALLBACK_CONFIG['library_dirs'] += ['/usr/local/lib'] # homebrew
elif sys.platform.startswith('freebsd'):
FALLBACK_CONFIG['include_dirs'] += ['/usr/local/include'] # homebrew
FALLBACK_CONFIG['library_dirs'] += ['/usr/local/lib'] # homebrew
FALLBACK_CONFIG['include_dirs'] = [d for d in FALLBACK_CONFIG['include_dirs']
if path.isdir(d)]
FALLBACK_CONFIG['library_dirs'] = [d for d in FALLBACK_CONFIG['library_dirs']
if path.isdir(d)]
FALLBACK_CONFIG['extra_compile_args'] = ['-DH5_BUILT_AS_DYNAMIC_LIB']
def pkgconfig(*packages, **kw):
config = kw.setdefault('config', {})
optional_args = kw.setdefault('optional', '')
flag_map = {'include_dirs': ['--cflags-only-I', 2],
'library_dirs': ['--libs-only-L', 2],
'libraries': ['--libs-only-l', 2],
'extra_compile_args': ['--cflags-only-other', 0],
'extra_link_args': ['--libs-only-other', 0],
}
for package in packages:
try:
subprocess.check_output(["pkg-config", package])
except (subprocess.CalledProcessError, OSError):
print("Can't find %s with pkg-config fallback to "
"static config" % package)
for distutils_key in flag_map:
config.setdefault(distutils_key, []).extend(
FALLBACK_CONFIG[distutils_key])
config['libraries'].append(package)
else:
for distutils_key, (pkg_option, n) in flag_map.items():
items = subprocess.check_output(
['pkg-config', optional_args, pkg_option, package]
).decode('utf8').split()
opt = config.setdefault(distutils_key, [])
opt.extend([i[n:] for i in items])
return config
ext_bshuf = Extension(
"bitshuffle.ext",
sources=["bitshuffle/ext.pyx", "src/bitshuffle.c",
"src/bitshuffle_core.c", "src/iochain.c",
"lz4/lz4.c"],
include_dirs=["src/", "lz4/"],
depends=["src/bitshuffle.h", "src/bitshuffle_core.h",
"src/iochain.h", "lz4/lz4.h"],
libraries=[],
define_macros=MACROS,
)
h5filter = Extension(
"bitshuffle.h5",
sources=["bitshuffle/h5.pyx", "src/bshuf_h5filter.c",
"src/bitshuffle.c", "src/bitshuffle_core.c",
"src/iochain.c", "lz4/lz4.c"],
depends=["src/bitshuffle.h", "src/bitshuffle_core.h",
"src/iochain.h", "src/bshuf_h5filter.h",
"lz4/lz4.h"],
define_macros=MACROS,
**pkgconfig("hdf5", config=dict(
include_dirs=["src/", "lz4/"]))
)
filter_plugin = Extension(
"bitshuffle.plugin.libh5bshuf",
sources=["src/bshuf_h5plugin.c", "src/bshuf_h5filter.c",
"src/bitshuffle.c", "src/bitshuffle_core.c",
"src/iochain.c", "lz4/lz4.c"],
depends=["src/bitshuffle.h", "src/bitshuffle_core.h",
"src/iochain.h", 'src/bshuf_h5filter.h',
"lz4/lz4.h"],
define_macros=MACROS,
**pkgconfig("hdf5", config=dict(
include_dirs=["src/", "lz4/"]))
)
lzf_plugin = Extension(
"bitshuffle.plugin.libh5LZF",
sources=["src/lzf_h5plugin.c", "lzf/lzf_filter.c",
"lzf/lzf/lzf_c.c", "lzf/lzf/lzf_d.c"],
depends=["lzf/lzf_filter.h", "lzf/lzf/lzf.h",
"lzf/lzf/lzfP.h"],
**pkgconfig("hdf5", config=dict(
include_dirs=["lzf/", "lzf/lzf/"]))
)
EXTENSIONS = [ext_bshuf, h5filter]
# Check for hdf5 plugin support (hdf5 >= 1.8.11)
HDF5_PLUGIN_SUPPORT = False
CPATHS = os.environ['CPATH'].split(':') if 'CPATH' in os.environ else []
for p in ["/usr/include"] + pkgconfig("hdf5")["include_dirs"] + CPATHS:
if os.path.exists(os.path.join(p, "H5PLextern.h")):
HDF5_PLUGIN_SUPPORT = True
if HDF5_PLUGIN_SUPPORT:
EXTENSIONS.extend([filter_plugin, lzf_plugin])
class develop(develop_):
def run(self):
# Dummy directory for copying build plugins.
if not path.isdir('bitshuffle/plugin'):
os.mkdir('bitshuffle/plugin')
develop_.run(self)
# Custom installation to include installing dynamic filters.
class install(install_):
user_options = install_.user_options + [
('h5plugin', None,
'Install HDF5 filter plugins for use outside of python.'),
('h5plugin-dir=', None,
'Where to install filter plugins. Default %s.' % H5PLUGINS_DEFAULT),
]
def initialize_options(self):
install_.initialize_options(self)
self.h5plugin = False
self.h5plugin_dir = H5PLUGINS_DEFAULT
def finalize_options(self):
install_.finalize_options(self)
if self.h5plugin not in ('0', '1', True, False):
raise ValueError("Invalid h5plugin argument. Mut be '0' or '1'.")
self.h5plugin = int(self.h5plugin)
self.h5plugin_dir = path.abspath(self.h5plugin_dir)
def run(self):
install_.run(self)
if self.h5plugin:
if not HDF5_PLUGIN_SUPPORT:
print("HDF5 < 1.8.11, not installing filter plugins.")
return
plugin_build = path.join(self.build_lib, "bitshuffle", "plugin")
try:
os.makedirs(self.h5plugin_dir)
except OSError as e:
if e.args[0] == 17:
# Directory already exists, this is fine.
pass
else:
raise
plugin_libs = glob.glob(path.join(plugin_build, "*"))
for plugin_lib in plugin_libs:
plugin_name = path.split(plugin_lib)[1]
shutil.copy2(plugin_lib,
path.join(self.h5plugin_dir, plugin_name))
print("Installed HDF5 filter plugins to %s" % self.h5plugin_dir)
# Command line or site.cfg specification of OpenMP.
class build_ext(build_ext_):
user_options = build_ext_.user_options + [
('omp=', None, "Whether to compile with OpenMP threading. Default"
" on current system is %s." % str(OMP_DEFAULT))
]
boolean_options = build_ext_.boolean_options + ['omp']
def initialize_options(self):
build_ext_.initialize_options(self)
self.omp = OMP_DEFAULT
def finalize_options(self):
# For some reason this gets run twice. Careful to print messages and
# add arguments only one time.
build_ext_.finalize_options(self)
if self.omp not in ('0', '1', True, False):
raise ValueError("Invalid omp argument. Mut be '0' or '1'.")
self.omp = int(self.omp)
import numpy as np
ext_bshuf.include_dirs.append(np.get_include())
# Required only by old version of setuptools < 18.0
from Cython.Build import cythonize
self.extensions = cythonize(self.extensions)
for ext in self.extensions:
ext._needs_stub = False
def build_extensions(self):
c = self.compiler.compiler_type
if self.omp not in ('0', '1', True, False):
raise ValueError("Invalid omp argument. Mut be '0' or '1'.")
self.omp = int(self.omp)
if self.omp:
if not hasattr(self, "_printed_omp_message"):
self._printed_omp_message = True
print("\n#################################")
print("# Compiling with OpenMP support #")
print("#################################\n")
# More portable to pass -fopenmp to linker.
# self.libraries += ['gomp']
if self.compiler.compiler_type == 'msvc':
openmpflag = '/openmp'
compileflags = COMPILE_FLAGS_MSVC
else:
openmpflag = '-fopenmp'
compileflags = COMPILE_FLAGS
for e in self.extensions:
e.extra_compile_args = list(set(e.extra_compile_args).union(compileflags))
if openmpflag not in e.extra_compile_args:
e.extra_compile_args += [openmpflag]
if openmpflag not in e.extra_link_args:
e.extra_link_args += [openmpflag]
build_ext_.build_extensions(self)
# Don't install numpy/cython/hdf5 if not needed
for cmd in ["sdist", "clean",
"--help", "--help-commands", "--version"]:
if cmd in sys.argv:
setup_requires = []
break
else:
setup_requires = ["Cython>=0.19", "numpy>=1.6.1"]
with open('requirements.txt') as f:
requires = f.read().splitlines()
requires = [r.split()[0] for r in requires]
with open('README.rst') as r:
long_description = r.read()
# TODO hdf5 support should be an "extra". Figure out how to set this up.
setup(
name='bitshuffle',
version=VERSION,
packages=['bitshuffle', 'bitshuffle.tests'],
scripts=[],
ext_modules=EXTENSIONS,
cmdclass={'build_ext': build_ext, 'install': install, 'develop': develop},
setup_requires=setup_requires,
install_requires=requires,
# extras_require={'H5': ["h5py"]},
package_data={'': ['data/*']},
# metadata for upload to PyPI
author="Kiyoshi Wesley Masui",
author_email="kiyo@physics.ubc.ca",
description="Bitshuffle filter for improving typed data compression.",
long_description=long_description,
license="MIT",
url="https://github.com/kiyo-masui/bitshuffle",
download_url=("https://github.com/kiyo-masui/bitshuffle/tarball/%s"
% VERSION),
keywords=['compression', 'hdf5', 'numpy'],
)
bitshuffle-0.3.5/src/bitshuffle.c
/*
* Bitshuffle - Filter for improving compression of typed binary data.
*
* Author: Kiyoshi Masui
* Website: http://www.github.com/kiyo-masui/bitshuffle
* Created: 2014
*
* See LICENSE file for details about copyright and rights to use.
*
*/
#include "bitshuffle.h"
#include "bitshuffle_core.h"
#include "bitshuffle_internals.h"
#include "lz4.h"
#include <stdlib.h>
#include <string.h>
// Constants.
// Use fast decompression instead of safe decompression for LZ4.
#define BSHUF_LZ4_DECOMPRESS_FAST
// Macros.
#define CHECK_ERR_FREE_LZ(count, buf) if (count < 0) { \
free(buf); return count - 1000; }
/* Bitshuffle and compress a single block. */
int64_t bshuf_compress_lz4_block(ioc_chain *C_ptr, \
const size_t size, const size_t elem_size) {
int64_t nbytes, count;
void *tmp_buf_bshuf;
void *tmp_buf_lz4;
size_t this_iter;
const void *in;
void *out;
tmp_buf_bshuf = malloc(size * elem_size);
if (tmp_buf_bshuf == NULL) return -1;
tmp_buf_lz4 = malloc(LZ4_compressBound(size * elem_size));
if (tmp_buf_lz4 == NULL){
free(tmp_buf_bshuf);
return -1;
}
in = ioc_get_in(C_ptr, &this_iter);
ioc_set_next_in(C_ptr, &this_iter, (void*) ((char*) in + size * elem_size));
count = bshuf_trans_bit_elem(in, tmp_buf_bshuf, size, elem_size);
if (count < 0) {
free(tmp_buf_lz4);
free(tmp_buf_bshuf);
return count;
}
nbytes = LZ4_compress((const char*) tmp_buf_bshuf, (char*) tmp_buf_lz4, size * elem_size);
free(tmp_buf_bshuf);
CHECK_ERR_FREE_LZ(nbytes, tmp_buf_lz4);
out = ioc_get_out(C_ptr, &this_iter);
ioc_set_next_out(C_ptr, &this_iter, (void *) ((char *) out + nbytes + 4));
bshuf_write_uint32_BE(out, nbytes);
memcpy((char *) out + 4, tmp_buf_lz4, nbytes);
free(tmp_buf_lz4);
return nbytes + 4;
}
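/* On-disk layout of one block (as produced above): a 4 byte big-endian
 * length prefix followed by the LZ4 stream. E.g. a block whose LZ4 output
 * is 100 bytes is stored as 00 00 00 64 plus those 100 bytes, and the
 * function returns 104. */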
/* Decompress and bitunshuffle a single block. */
int64_t bshuf_decompress_lz4_block(ioc_chain *C_ptr,
const size_t size, const size_t elem_size) {
int64_t nbytes, count;
void *out, *tmp_buf;
const void *in;
size_t this_iter;
int32_t nbytes_from_header;
in = ioc_get_in(C_ptr, &this_iter);
nbytes_from_header = bshuf_read_uint32_BE(in);
ioc_set_next_in(C_ptr, &this_iter,
(void*) ((char*) in + nbytes_from_header + 4));
out = ioc_get_out(C_ptr, &this_iter);
ioc_set_next_out(C_ptr, &this_iter,
(void *) ((char *) out + size * elem_size));
tmp_buf = malloc(size * elem_size);
if (tmp_buf == NULL) return -1;
#ifdef BSHUF_LZ4_DECOMPRESS_FAST
nbytes = LZ4_decompress_fast((const char*) in + 4, (char*) tmp_buf, size * elem_size);
CHECK_ERR_FREE_LZ(nbytes, tmp_buf);
if (nbytes != nbytes_from_header) {
free(tmp_buf);
return -91;
}
#else
nbytes = LZ4_decompress_safe((const char*) in + 4, (char *) tmp_buf, nbytes_from_header,
size * elem_size);
CHECK_ERR_FREE_LZ(nbytes, tmp_buf);
if (nbytes != size * elem_size) {
free(tmp_buf);
return -91;
}
nbytes = nbytes_from_header;
#endif
count = bshuf_untrans_bit_elem(tmp_buf, out, size, elem_size);
CHECK_ERR_FREE(count, tmp_buf);
nbytes += 4;
free(tmp_buf);
return nbytes;
}
/* ---- Public functions ----
*
* See header file for description and usage.
*
*/
size_t bshuf_compress_lz4_bound(const size_t size,
const size_t elem_size, size_t block_size) {
size_t bound, leftover;
if (block_size == 0) {
block_size = bshuf_default_block_size(elem_size);
}
if (block_size % BSHUF_BLOCKED_MULT) return -81;
// Note that each block gets a 4 byte header.
// Size of full blocks.
bound = (LZ4_compressBound(block_size * elem_size) + 4) * (size / block_size);
// Size of partial blocks, if any.
leftover = ((size % block_size) / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT;
if (leftover) bound += LZ4_compressBound(leftover * elem_size) + 4;
// Size of uncompressed data not fitting into any blocks.
bound += (size % BSHUF_BLOCKED_MULT) * elem_size;
return bound;
}
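/* Worked example of the bound arithmetic (assuming BSHUF_BLOCKED_MULT == 8
 * and LZ4's documented bound of n + n/255 + 16): size = 10000,
 * elem_size = 4, block_size = 2048 gives 4 full blocks of 8192 bytes, each
 * bounded by 8240 + 4 = 8244; the 1808-element partial block (7232 bytes)
 * adds 7276 + 4 = 7280; size % 8 == 0 leaves no uncompressed tail, so the
 * bound is 4 * 8244 + 7280 = 40256 bytes. */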
int64_t bshuf_compress_lz4(const void* in, void* out, const size_t size,
const size_t elem_size, size_t block_size) {
return bshuf_blocked_wrap_fun(&bshuf_compress_lz4_block, in, out, size,
elem_size, block_size);
}
int64_t bshuf_decompress_lz4(const void* in, void* out, const size_t size,
const size_t elem_size, size_t block_size) {
return bshuf_blocked_wrap_fun(&bshuf_decompress_lz4_block, in, out, size,
elem_size, block_size);
}
bitshuffle-0.3.5/src/bitshuffle.h
/*
* Bitshuffle - Filter for improving compression of typed binary data.
*
* This file is part of Bitshuffle
* Author: Kiyoshi Masui
* Website: http://www.github.com/kiyo-masui/bitshuffle
* Created: 2014
*
* See LICENSE file for details about copyright and rights to use.
*
*
* Header File
*
* Worker routines return an int64_t which is the number of bytes processed
* if positive or an error code if negative.
*
* Error codes:
* -1 : Failed to allocate memory.
* -11 : Missing SSE.
* -12 : Missing AVX.
* -80 : Input size not a multiple of 8.
* -81 : block_size not multiple of 8.
* -91 : Decompression error, wrong number of bytes processed.
* -1YYY : Error internal to compression routine with error code -YYY.
*/
#ifndef BITSHUFFLE_H
#define BITSHUFFLE_H
#include <stdlib.h>
#include "bitshuffle_core.h"
#ifdef __cplusplus
extern "C" {
#endif
/* ---- bshuf_compress_lz4_bound ----
*
* Bound on size of data compressed with *bshuf_compress_lz4*.
*
* Parameters
* ----------
* size : number of elements in input
* elem_size : element size of typed data
* block_size : Process in blocks of this many elements. Pass 0 to
* select automatically (recommended).
*
* Returns
* -------
* Bound on compressed data size.
*
*/
size_t bshuf_compress_lz4_bound(const size_t size,
const size_t elem_size, size_t block_size);
/* ---- bshuf_compress_lz4 ----
*
* Bitshuffled and compress the data using LZ4.
*
* Transpose within elements, in blocks of data of *block_size* elements then
* compress the blocks using LZ4. In the output buffer, each block is prefixed
* by a 4 byte integer giving the compressed size of that block.
*
* Output buffer must be large enough to hold the compressed data. This could
* be in principle substantially larger than the input buffer. Use the routine
* *bshuf_compress_lz4_bound* to get an upper limit.
*
* Parameters
* ----------
* in : input buffer, must be of size * elem_size bytes
* out : output buffer, must be large enough to hold data.
* size : number of elements in input
* elem_size : element size of typed data
* block_size : Process in blocks of this many elements. Pass 0 to
* select automatically (recommended).
*
* Returns
* -------
* number of bytes used in output buffer, negative error-code if failed.
*
*/
int64_t bshuf_compress_lz4(const void* in, void* out, const size_t size, const size_t
elem_size, size_t block_size);
/* ---- bshuf_decompress_lz4 ----
*
* Undo compression and bitshuffling.
*
* Decompress data then un-bitshuffle it in blocks of *block_size* elements.
*
* To properly unshuffle bitshuffled data, *size*, *elem_size* and *block_size*
* must match the parameters used to compress the data.
*
* NOT TO BE USED WITH UNTRUSTED DATA: This routine uses the function
* LZ4_decompress_fast from LZ4, which does not protect against maliciously
* formed datasets. By modifying the compressed data, this function could be
* coerced into leaving the boundaries of the input buffer.
*
* Parameters
* ----------
* in : input buffer
* out : output buffer, must be of size * elem_size bytes
* size : number of elements in input
* elem_size : element size of typed data
* block_size : Process in blocks of this many elements. Pass 0 to
* select automatically (recommended).
*
* Returns
* -------
* number of bytes consumed in *input* buffer, negative error-code if failed.
*
*/
int64_t bshuf_decompress_lz4(const void* in, void* out, const size_t size,
const size_t elem_size, size_t block_size);
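/* ---- Typical round trip (illustrative) ----
 *
 * size_t bound = bshuf_compress_lz4_bound(size, elem_size, 0);
 * void *comp = malloc(bound);
 * int64_t c_bytes = bshuf_compress_lz4(in, comp, size, elem_size, 0);
 * ...store c_bytes bytes of comp, then later:
 * int64_t used = bshuf_decompress_lz4(comp, out, size, elem_size, 0);
 *
 * The same size, elem_size and block_size must be passed to both calls;
 * check both return values against the error codes listed at the top of
 * this header.
 */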
#ifdef __cplusplus
} // extern "C"
#endif
#endif // BITSHUFFLE_H
bitshuffle-0.3.5/src/bitshuffle_core.c
/*
* Bitshuffle - Filter for improving compression of typed binary data.
*
* Author: Kiyoshi Masui
* Website: http://www.github.com/kiyo-masui/bitshuffle
* Created: 2014
*
* See LICENSE file for details about copyright and rights to use.
*
*/
#include "bitshuffle_core.h"
#include "bitshuffle_internals.h"
#include <stdlib.h>
#include <string.h>
#if defined(__AVX2__) && defined (__SSE2__)
#define USEAVX2
#endif
#if defined(__SSE2__)
#define USESSE2
#endif
#if defined(__ARM_NEON__) || (__ARM_NEON)
#define USEARMNEON
#endif
// Conditional includes for SSE2 and AVX2.
#ifdef USEAVX2
#include <immintrin.h>
#elif defined USESSE2
#include <emmintrin.h>
#elif defined USEARMNEON
#include <arm_neon.h>
#endif
#if defined(_OPENMP) && defined(_MSC_VER)
typedef int64_t omp_size_t;
#else
typedef size_t omp_size_t;
#endif
// Macros.
#define CHECK_MULT_EIGHT(n) if (n % 8) return -80;
#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
/* ---- Functions indicating compile time instruction set. ---- */
int bshuf_using_NEON(void) {
#ifdef USEARMNEON
return 1;
#else
return 0;
#endif
}
int bshuf_using_SSE2(void) {
#ifdef USESSE2
return 1;
#else
return 0;
#endif
}
int bshuf_using_AVX2(void) {
#ifdef USEAVX2
return 1;
#else
return 0;
#endif
}
/* ---- Worker code not requiring special instruction sets. ----
*
* The following code does not use any x86 specific vectorized instructions
* and should compile on any machine
*
*/
/* Transpose 8x8 bit array packed into a single quadword *x*.
* *t* is workspace. */
#define TRANS_BIT_8X8(x, t) { \
t = (x ^ (x >> 7)) & 0x00AA00AA00AA00AALL; \
x = x ^ t ^ (t << 7); \
t = (x ^ (x >> 14)) & 0x0000CCCC0000CCCCLL; \
x = x ^ t ^ (t << 14); \
t = (x ^ (x >> 28)) & 0x00000000F0F0F0F0LL; \
x = x ^ t ^ (t << 28); \
}
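/* Example: the macro treats x as an 8x8 bit matrix, one byte per row, and
 * exchanges progressively larger off-diagonal blocks (1x1, 2x2, 4x4) --
 * the classic delta-swap transpose. E.g. x = 0x0101010101010101 (bit 0 set
 * in every row) transposes to 0x00000000000000FF (row 0 all ones). */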
/* Transpose 8x8 bit array along the diagonal from upper right
to lower left */
#define TRANS_BIT_8X8_BE(x, t) { \
t = (x ^ (x >> 9)) & 0x0055005500550055LL; \
x = x ^ t ^ (t << 9); \
t = (x ^ (x >> 18)) & 0x0000333300003333LL; \
x = x ^ t ^ (t << 18); \
t = (x ^ (x >> 36)) & 0x000000000F0F0F0FLL; \
x = x ^ t ^ (t << 36); \
}
/* Transpose of an array of arbitrarily typed elements. */
#define TRANS_ELEM_TYPE(in, out, lda, ldb, type_t) { \
size_t ii, jj, kk; \
const type_t* in_type = (const type_t*) in; \
type_t* out_type = (type_t*) out; \
for(ii = 0; ii + 7 < lda; ii += 8) { \
for(jj = 0; jj < ldb; jj++) { \
for(kk = 0; kk < 8; kk++) { \
out_type[jj*lda + ii + kk] = \
in_type[ii*ldb + kk * ldb + jj]; \
} \
} \
} \
for(ii = lda - lda % 8; ii < lda; ii ++) { \
for(jj = 0; jj < ldb; jj++) { \
out_type[jj*lda + ii] = in_type[ii*ldb + jj]; \
} \
} \
}
/* Memory copy with bshuf call signature. For testing and profiling. */
int64_t bshuf_copy(const void* in, void* out, const size_t size,
const size_t elem_size) {
const char* in_b = (const char*) in;
char* out_b = (char*) out;
memcpy(out_b, in_b, size * elem_size);
return size * elem_size;
}
/* Transpose bytes within elements, starting partway through input. */
int64_t bshuf_trans_byte_elem_remainder(const void* in, void* out, const size_t size,
const size_t elem_size, const size_t start) {
size_t ii, jj, kk;
const char* in_b = (const char*) in;
char* out_b = (char*) out;
CHECK_MULT_EIGHT(start);
if (size > start) {
// ii loop separated into 2 loops so the compiler can unroll
// the inner one.
for (ii = start; ii + 7 < size; ii += 8) {
for (jj = 0; jj < elem_size; jj++) {
for (kk = 0; kk < 8; kk++) {
out_b[jj * size + ii + kk]
= in_b[ii * elem_size + kk * elem_size + jj];
}
}
}
for (ii = size - size % 8; ii < size; ii ++) {
for (jj = 0; jj < elem_size; jj++) {
out_b[jj * size + ii] = in_b[ii * elem_size + jj];
}
}
}
return size * elem_size;
}
/* Transpose bytes within elements. */
int64_t bshuf_trans_byte_elem_scal(const void* in, void* out, const size_t size,
const size_t elem_size) {
return bshuf_trans_byte_elem_remainder(in, out, size, elem_size, 0);
}
/* Transpose bits within bytes. */
int64_t bshuf_trans_bit_byte_remainder(const void* in, void* out, const size_t size,
const size_t elem_size, const size_t start_byte) {
const uint64_t* in_b = (const uint64_t*) in;
uint8_t* out_b = (uint8_t*) out;
uint64_t x, t;
size_t ii, kk;
size_t nbyte = elem_size * size;
size_t nbyte_bitrow = nbyte / 8;
uint64_t e=1;
const int little_endian = *(uint8_t *) &e == 1;
const size_t bit_row_skip = little_endian ? nbyte_bitrow : -nbyte_bitrow;
const int64_t bit_row_offset = little_endian ? 0 : 7 * nbyte_bitrow;
CHECK_MULT_EIGHT(nbyte);
CHECK_MULT_EIGHT(start_byte);
for (ii = start_byte / 8; ii < nbyte_bitrow; ii ++) {
x = in_b[ii];
if (little_endian) {
TRANS_BIT_8X8(x, t);
} else {
TRANS_BIT_8X8_BE(x, t);
}
for (kk = 0; kk < 8; kk ++) {
out_b[bit_row_offset + kk * bit_row_skip + ii] = x;
x = x >> 8;
}
}
return size * elem_size;
}
/* Transpose bits within bytes. */
int64_t bshuf_trans_bit_byte_scal(const void* in, void* out, const size_t size,
const size_t elem_size) {
return bshuf_trans_bit_byte_remainder(in, out, size, elem_size, 0);
}
/* General transpose of an array, optimized for large element sizes. */
int64_t bshuf_trans_elem(const void* in, void* out, const size_t lda,
const size_t ldb, const size_t elem_size) {
size_t ii, jj;
const char* in_b = (const char*) in;
char* out_b = (char*) out;
for(ii = 0; ii < lda; ii++) {
for(jj = 0; jj < ldb; jj++) {
memcpy(&out_b[(jj*lda + ii) * elem_size],
&in_b[(ii*ldb + jj) * elem_size], elem_size);
}
}
return lda * ldb * elem_size;
}
/* Transpose rows of shuffled bits (size / 8 bytes) within groups of 8. */
int64_t bshuf_trans_bitrow_eight(const void* in, void* out, const size_t size,
const size_t elem_size) {
size_t nbyte_bitrow = size / 8;
CHECK_MULT_EIGHT(size);
return bshuf_trans_elem(in, out, 8, elem_size, nbyte_bitrow);
}
/* Transpose bits within elements. */
int64_t bshuf_trans_bit_elem_scal(const void* in, void* out, const size_t size,
const size_t elem_size) {
int64_t count;
void *tmp_buf;
CHECK_MULT_EIGHT(size);
tmp_buf = malloc(size * elem_size);
if (tmp_buf == NULL) return -1;
count = bshuf_trans_byte_elem_scal(in, out, size, elem_size);
CHECK_ERR_FREE(count, tmp_buf);
count = bshuf_trans_bit_byte_scal(out, tmp_buf, size, elem_size);
CHECK_ERR_FREE(count, tmp_buf);
count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
free(tmp_buf);
return count;
}
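/* The scalar shuffle above is a three-stage pipeline: (1) byte-transpose so
 * byte k of every element is contiguous, (2) bit-transpose within each
 * group of 8 bytes, (3) regroup the eight resulting bit-rows. Net effect:
 * bit b of byte k of every element lands in one contiguous run of size/8
 * bytes, e.g. size = 16, elem_size = 2 yields 16 bit-rows of 2 bytes each. */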
/* For data organized into a row for each bit (8 * elem_size rows), transpose
* the bytes. */
int64_t bshuf_trans_byte_bitrow_scal(const void* in, void* out, const size_t size,
const size_t elem_size) {
size_t ii, jj, kk, nbyte_row;
const char *in_b;
char *out_b;
in_b = (const char*) in;
out_b = (char*) out;
nbyte_row = size / 8;
CHECK_MULT_EIGHT(size);
for (jj = 0; jj < elem_size; jj++) {
for (ii = 0; ii < nbyte_row; ii++) {
for (kk = 0; kk < 8; kk++) {
out_b[ii * 8 * elem_size + jj * 8 + kk] = \
in_b[(jj * 8 + kk) * nbyte_row + ii];
}
}
}
return size * elem_size;
}
/* Shuffle bits within the bytes of eight element blocks. */
int64_t bshuf_shuffle_bit_eightelem_scal(const void* in, void* out, \
const size_t size, const size_t elem_size) {
const char *in_b;
char *out_b;
uint64_t x, t;
size_t ii, jj, kk;
size_t nbyte, out_index;
uint64_t e=1;
const int little_endian = *(uint8_t *) &e == 1;
const size_t elem_skip = little_endian ? elem_size : -elem_size;
const uint64_t elem_offset = little_endian ? 0 : 7 * elem_size;
CHECK_MULT_EIGHT(size);
in_b = (const char*) in;
out_b = (char*) out;
nbyte = elem_size * size;
for (jj = 0; jj < 8 * elem_size; jj += 8) {
for (ii = 0; ii + 8 * elem_size - 1 < nbyte; ii += 8 * elem_size) {
x = *((uint64_t*) &in_b[ii + jj]);
if (little_endian) {
TRANS_BIT_8X8(x, t);
} else {
TRANS_BIT_8X8_BE(x, t);
}
for (kk = 0; kk < 8; kk++) {
out_index = ii + jj / 8 + elem_offset + kk * elem_skip;
*((uint8_t*) &out_b[out_index]) = x;
x = x >> 8;
}
}
}
return size * elem_size;
}
/* Untranspose bits within elements. */
int64_t bshuf_untrans_bit_elem_scal(const void* in, void* out, const size_t size,
const size_t elem_size) {
int64_t count;
void *tmp_buf;
CHECK_MULT_EIGHT(size);
tmp_buf = malloc(size * elem_size);
if (tmp_buf == NULL) return -1;
count = bshuf_trans_byte_bitrow_scal(in, tmp_buf, size, elem_size);
CHECK_ERR_FREE(count, tmp_buf);
count = bshuf_shuffle_bit_eightelem_scal(tmp_buf, out, size, elem_size);
free(tmp_buf);
return count;
}
/* ---- Worker code that uses Arm NEON ----
*
* The following code makes use of the Arm NEON instruction set.
* NEON technology is the implementation of the ARM Advanced Single
* Instruction Multiple Data (SIMD) extension.
* The NEON unit is the component of the processor that executes SIMD instructions.
* It is also called the NEON Media Processing Engine (MPE).
*
*/
#ifdef USEARMNEON
/* Transpose bytes within elements for 16 bit elements. */
int64_t bshuf_trans_byte_elem_NEON_16(const void* in, void* out, const size_t size) {
size_t ii;
const char *in_b = (const char*) in;
char *out_b = (char*) out;
int8x16_t a0, b0, a1, b1;
for (ii=0; ii + 15 < size; ii += 16) {
a0 = vld1q_s8(in_b + 2*ii + 0*16);
b0 = vld1q_s8(in_b + 2*ii + 1*16);
a1 = vzip1q_s8(a0, b0);
b1 = vzip2q_s8(a0, b0);
a0 = vzip1q_s8(a1, b1);
b0 = vzip2q_s8(a1, b1);
a1 = vzip1q_s8(a0, b0);
b1 = vzip2q_s8(a0, b0);
a0 = vzip1q_s8(a1, b1);
b0 = vzip2q_s8(a1, b1);
vst1q_s8(out_b + 0*size + ii, a0);
vst1q_s8(out_b + 1*size + ii, b0);
}
return bshuf_trans_byte_elem_remainder(in, out, size, 2,
size - size % 16);
}
/* Transpose bytes within elements for 32 bit elements. */
int64_t bshuf_trans_byte_elem_NEON_32(const void* in, void* out, const size_t size) {
size_t ii;
const char *in_b;
char *out_b;
in_b = (const char*) in;
out_b = (char*) out;
int8x16_t a0, b0, c0, d0, a1, b1, c1, d1;
int64x2_t a2, b2, c2, d2;
for (ii=0; ii + 15 < size; ii += 16) {
a0 = vld1q_s8(in_b + 4*ii + 0*16);
b0 = vld1q_s8(in_b + 4*ii + 1*16);
c0 = vld1q_s8(in_b + 4*ii + 2*16);
d0 = vld1q_s8(in_b + 4*ii + 3*16);
a1 = vzip1q_s8(a0, b0);
b1 = vzip2q_s8(a0, b0);
c1 = vzip1q_s8(c0, d0);
d1 = vzip2q_s8(c0, d0);
a0 = vzip1q_s8(a1, b1);
b0 = vzip2q_s8(a1, b1);
c0 = vzip1q_s8(c1, d1);
d0 = vzip2q_s8(c1, d1);
a1 = vzip1q_s8(a0, b0);
b1 = vzip2q_s8(a0, b0);
c1 = vzip1q_s8(c0, d0);
d1 = vzip2q_s8(c0, d0);
a2 = vzip1q_s64(vreinterpretq_s64_s8(a1), vreinterpretq_s64_s8(c1));
b2 = vzip2q_s64(vreinterpretq_s64_s8(a1), vreinterpretq_s64_s8(c1));
c2 = vzip1q_s64(vreinterpretq_s64_s8(b1), vreinterpretq_s64_s8(d1));
d2 = vzip2q_s64(vreinterpretq_s64_s8(b1), vreinterpretq_s64_s8(d1));
vst1q_s64((int64_t *) (out_b + 0*size + ii), a2);
vst1q_s64((int64_t *) (out_b + 1*size + ii), b2);
vst1q_s64((int64_t *) (out_b + 2*size + ii), c2);
vst1q_s64((int64_t *) (out_b + 3*size + ii), d2);
}
return bshuf_trans_byte_elem_remainder(in, out, size, 4,
size - size % 16);
}
/* Transpose bytes within elements for 64 bit elements. */
int64_t bshuf_trans_byte_elem_NEON_64(const void* in, void* out, const size_t size) {
size_t ii;
const char* in_b = (const char*) in;
char* out_b = (char*) out;
int8x16_t a0, b0, c0, d0, e0, f0, g0, h0;
int8x16_t a1, b1, c1, d1, e1, f1, g1, h1;
for (ii=0; ii + 15 < size; ii += 16) {
a0 = vld1q_s8(in_b + 8*ii + 0*16);
b0 = vld1q_s8(in_b + 8*ii + 1*16);
c0 = vld1q_s8(in_b + 8*ii + 2*16);
d0 = vld1q_s8(in_b + 8*ii + 3*16);
e0 = vld1q_s8(in_b + 8*ii + 4*16);
f0 = vld1q_s8(in_b + 8*ii + 5*16);
g0 = vld1q_s8(in_b + 8*ii + 6*16);
h0 = vld1q_s8(in_b + 8*ii + 7*16);
a1 = vzip1q_s8 (a0, b0);
b1 = vzip2q_s8 (a0, b0);
c1 = vzip1q_s8 (c0, d0);
d1 = vzip2q_s8 (c0, d0);
e1 = vzip1q_s8 (e0, f0);
f1 = vzip2q_s8 (e0, f0);
g1 = vzip1q_s8 (g0, h0);
h1 = vzip2q_s8 (g0, h0);
a0 = vzip1q_s8 (a1, b1);
b0 = vzip2q_s8 (a1, b1);
c0 = vzip1q_s8 (c1, d1);
d0 = vzip2q_s8 (c1, d1);
e0 = vzip1q_s8 (e1, f1);
f0 = vzip2q_s8 (e1, f1);
g0 = vzip1q_s8 (g1, h1);
h0 = vzip2q_s8 (g1, h1);
a1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (c0));
b1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (c0));
c1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (b0), vreinterpretq_s32_s8 (d0));
d1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (b0), vreinterpretq_s32_s8 (d0));
e1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (g0));
f1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (g0));
g1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (f0), vreinterpretq_s32_s8 (h0));
h1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (f0), vreinterpretq_s32_s8 (h0));
a0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (a1), vreinterpretq_s64_s8 (e1));
b0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (a1), vreinterpretq_s64_s8 (e1));
c0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (b1), vreinterpretq_s64_s8 (f1));
d0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (b1), vreinterpretq_s64_s8 (f1));
e0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (c1), vreinterpretq_s64_s8 (g1));
f0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (c1), vreinterpretq_s64_s8 (g1));
g0 = (int8x16_t) vzip1q_s64 (vreinterpretq_s64_s8 (d1), vreinterpretq_s64_s8 (h1));
h0 = (int8x16_t) vzip2q_s64 (vreinterpretq_s64_s8 (d1), vreinterpretq_s64_s8 (h1));
vst1q_s8(out_b + 0*size + ii, a0);
vst1q_s8(out_b + 1*size + ii, b0);
vst1q_s8(out_b + 2*size + ii, c0);
vst1q_s8(out_b + 3*size + ii, d0);
vst1q_s8(out_b + 4*size + ii, e0);
vst1q_s8(out_b + 5*size + ii, f0);
vst1q_s8(out_b + 6*size + ii, g0);
vst1q_s8(out_b + 7*size + ii, h0);
}
return bshuf_trans_byte_elem_remainder(in, out, size, 8,
size - size % 16);
}
/* Transpose bytes within elements using best NEON algorithm available. */
int64_t bshuf_trans_byte_elem_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
int64_t count;
// Trivial cases: power of 2 bytes.
switch (elem_size) {
case 1:
count = bshuf_copy(in, out, size, elem_size);
return count;
case 2:
count = bshuf_trans_byte_elem_NEON_16(in, out, size);
return count;
case 4:
count = bshuf_trans_byte_elem_NEON_32(in, out, size);
return count;
case 8:
count = bshuf_trans_byte_elem_NEON_64(in, out, size);
return count;
}
// Worst case: odd number of bytes. Turns out that this is faster for
// (odd * 2) byte elements as well (hence % 4).
if (elem_size % 4) {
count = bshuf_trans_byte_elem_scal(in, out, size, elem_size);
return count;
}
// Multiple of power of 2: transpose hierarchically.
{
size_t nchunk_elem;
void* tmp_buf = malloc(size * elem_size);
if (tmp_buf == NULL) return -1;
if ((elem_size % 8) == 0) {
nchunk_elem = elem_size / 8;
TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int64_t);
count = bshuf_trans_byte_elem_NEON_64(out, tmp_buf,
size * nchunk_elem);
bshuf_trans_elem(tmp_buf, out, 8, nchunk_elem, size);
} else if ((elem_size % 4) == 0) {
nchunk_elem = elem_size / 4;
TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int32_t);
count = bshuf_trans_byte_elem_NEON_32(out, tmp_buf,
size * nchunk_elem);
bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size);
} else {
// Not used since scalar algorithm is faster.
nchunk_elem = elem_size / 2;
TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t);
count = bshuf_trans_byte_elem_NEON_16(out, tmp_buf,
size * nchunk_elem);
bshuf_trans_elem(tmp_buf, out, 2, nchunk_elem, size);
}
free(tmp_buf);
return count;
}
}
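/* Hierarchical case, worked through: elem_size == 16 is split into
 * nchunk_elem = 2 int64 chunks per element; TRANS_ELEM_TYPE gathers chunk
 * j of every element together, the 8-byte NEON kernel transposes each
 * gathered stream, and bshuf_trans_elem reassembles the rows into the
 * final byte-transposed order. */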
/* Creates a mask made up of the most significant
* bit of each byte of 'input'
*/
int32_t move_byte_mask_neon(uint8x16_t input) {
return ( ((input[0] & 0x80) >> 7) | (((input[1] & 0x80) >> 7) << 1) | (((input[2] & 0x80) >> 7) << 2) | (((input[3] & 0x80) >> 7) << 3)
| (((input[4] & 0x80) >> 7) << 4) | (((input[5] & 0x80) >> 7) << 5) | (((input[6] & 0x80) >> 7) << 6) | (((input[7] & 0x80) >> 7) << 7)
| (((input[8] & 0x80) >> 7) << 8) | (((input[9] & 0x80) >> 7) << 9) | (((input[10] & 0x80) >> 7) << 10) | (((input[11] & 0x80) >> 7) << 11)
| (((input[12] & 0x80) >> 7) << 12) | (((input[13] & 0x80) >> 7) << 13) | (((input[14] & 0x80) >> 7) << 14) | (((input[15] & 0x80) >> 7) << 15)
);
}
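/* This mirrors the semantics of SSE2's _mm_movemask_epi8: bit i of the
 * result is the most significant bit of byte i, so e.g. a vector whose
 * bytes are all 0x80 yields 0xFFFF. */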
/* Transpose bits within bytes. */
int64_t bshuf_trans_bit_byte_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
size_t ii, kk;
const char* in_b = (const char*) in;
char* out_b = (char*) out;
uint16_t* out_ui16;
int64_t count;
size_t nbyte = elem_size * size;
CHECK_MULT_EIGHT(nbyte);
int16x8_t xmm;
int32_t bt;
for (ii = 0; ii + 15 < nbyte; ii += 16) {
xmm = vld1q_s16((int16_t *) (in_b + ii));
for (kk = 0; kk < 8; kk++) {
bt = move_byte_mask_neon((uint8x16_t) xmm);
xmm = vshlq_n_s16(xmm, 1);
out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
*out_ui16 = bt;
}
}
count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
nbyte - nbyte % 16);
return count;
}
/* Transpose bits within elements. */
int64_t bshuf_trans_bit_elem_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
int64_t count;
CHECK_MULT_EIGHT(size);
void* tmp_buf = malloc(size * elem_size);
if (tmp_buf == NULL) return -1;
count = bshuf_trans_byte_elem_NEON(in, out, size, elem_size);
CHECK_ERR_FREE(count, tmp_buf);
count = bshuf_trans_bit_byte_NEON(out, tmp_buf, size, elem_size);
CHECK_ERR_FREE(count, tmp_buf);
count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
free(tmp_buf);
return count;
}
/* For data organized into a row for each bit (8 * elem_size rows), transpose
* the bytes. */
int64_t bshuf_trans_byte_bitrow_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
size_t ii, jj;
const char* in_b = (const char*) in;
char* out_b = (char*) out;
CHECK_MULT_EIGHT(size);
size_t nrows = 8 * elem_size;
size_t nbyte_row = size / 8;
int8x16_t a0, b0, c0, d0, e0, f0, g0, h0;
int8x16_t a1, b1, c1, d1, e1, f1, g1, h1;
int64x1_t *as, *bs, *cs, *ds, *es, *fs, *gs, *hs;
for (ii = 0; ii + 7 < nrows; ii += 8) {
for (jj = 0; jj + 15 < nbyte_row; jj += 16) {
a0 = vld1q_s8(in_b + (ii + 0)*nbyte_row + jj);
b0 = vld1q_s8(in_b + (ii + 1)*nbyte_row + jj);
c0 = vld1q_s8(in_b + (ii + 2)*nbyte_row + jj);
d0 = vld1q_s8(in_b + (ii + 3)*nbyte_row + jj);
e0 = vld1q_s8(in_b + (ii + 4)*nbyte_row + jj);
f0 = vld1q_s8(in_b + (ii + 5)*nbyte_row + jj);
g0 = vld1q_s8(in_b + (ii + 6)*nbyte_row + jj);
h0 = vld1q_s8(in_b + (ii + 7)*nbyte_row + jj);
a1 = vzip1q_s8(a0, b0);
b1 = vzip1q_s8(c0, d0);
c1 = vzip1q_s8(e0, f0);
d1 = vzip1q_s8(g0, h0);
e1 = vzip2q_s8(a0, b0);
f1 = vzip2q_s8(c0, d0);
g1 = vzip2q_s8(e0, f0);
h1 = vzip2q_s8(g0, h0);
a0 = (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (a1), vreinterpretq_s16_s8 (b1));
b0= (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (c1), vreinterpretq_s16_s8 (d1));
c0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (a1), vreinterpretq_s16_s8 (b1));
d0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (c1), vreinterpretq_s16_s8 (d1));
e0 = (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (e1), vreinterpretq_s16_s8 (f1));
f0 = (int8x16_t) vzip1q_s16 (vreinterpretq_s16_s8 (g1), vreinterpretq_s16_s8 (h1));
g0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (e1), vreinterpretq_s16_s8 (f1));
h0 = (int8x16_t) vzip2q_s16 (vreinterpretq_s16_s8 (g1), vreinterpretq_s16_s8 (h1));
a1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (b0));
b1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (a0), vreinterpretq_s32_s8 (b0));
c1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (c0), vreinterpretq_s32_s8 (d0));
d1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (c0), vreinterpretq_s32_s8 (d0));
e1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (f0));
f1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (e0), vreinterpretq_s32_s8 (f0));
g1 = (int8x16_t) vzip1q_s32 (vreinterpretq_s32_s8 (g0), vreinterpretq_s32_s8 (h0));
h1 = (int8x16_t) vzip2q_s32 (vreinterpretq_s32_s8 (g0), vreinterpretq_s32_s8 (h0));
as = (int64x1_t *) &a1;
bs = (int64x1_t *) &b1;
cs = (int64x1_t *) &c1;
ds = (int64x1_t *) &d1;
es = (int64x1_t *) &e1;
fs = (int64x1_t *) &f1;
gs = (int64x1_t *) &g1;
hs = (int64x1_t *) &h1;
vst1_s64((int64_t *)(out_b + (jj + 0) * nrows + ii), *as);
vst1_s64((int64_t *)(out_b + (jj + 1) * nrows + ii), *(as + 1));
vst1_s64((int64_t *)(out_b + (jj + 2) * nrows + ii), *bs);
vst1_s64((int64_t *)(out_b + (jj + 3) * nrows + ii), *(bs + 1));
vst1_s64((int64_t *)(out_b + (jj + 4) * nrows + ii), *cs);
vst1_s64((int64_t *)(out_b + (jj + 5) * nrows + ii), *(cs + 1));
vst1_s64((int64_t *)(out_b + (jj + 6) * nrows + ii), *ds);
vst1_s64((int64_t *)(out_b + (jj + 7) * nrows + ii), *(ds + 1));
vst1_s64((int64_t *)(out_b + (jj + 8) * nrows + ii), *es);
vst1_s64((int64_t *)(out_b + (jj + 9) * nrows + ii), *(es + 1));
vst1_s64((int64_t *)(out_b + (jj + 10) * nrows + ii), *fs);
vst1_s64((int64_t *)(out_b + (jj + 11) * nrows + ii), *(fs + 1));
vst1_s64((int64_t *)(out_b + (jj + 12) * nrows + ii), *gs);
vst1_s64((int64_t *)(out_b + (jj + 13) * nrows + ii), *(gs + 1));
vst1_s64((int64_t *)(out_b + (jj + 14) * nrows + ii), *hs);
vst1_s64((int64_t *)(out_b + (jj + 15) * nrows + ii), *(hs + 1));
}
for (jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj ++) {
out_b[jj * nrows + ii + 0] = in_b[(ii + 0)*nbyte_row + jj];
out_b[jj * nrows + ii + 1] = in_b[(ii + 1)*nbyte_row + jj];
out_b[jj * nrows + ii + 2] = in_b[(ii + 2)*nbyte_row + jj];
out_b[jj * nrows + ii + 3] = in_b[(ii + 3)*nbyte_row + jj];
out_b[jj * nrows + ii + 4] = in_b[(ii + 4)*nbyte_row + jj];
out_b[jj * nrows + ii + 5] = in_b[(ii + 5)*nbyte_row + jj];
out_b[jj * nrows + ii + 6] = in_b[(ii + 6)*nbyte_row + jj];
out_b[jj * nrows + ii + 7] = in_b[(ii + 7)*nbyte_row + jj];
}
}
return size * elem_size;
}
/* Shuffle bits within the bytes of eight element blocks. */
int64_t bshuf_shuffle_bit_eightelem_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
CHECK_MULT_EIGHT(size);
// With a bit of care, this could be written such that it is
// in_buf = out_buf safe.
const char* in_b = (const char*) in;
uint16_t* out_ui16 = (uint16_t*) out;
size_t ii, jj, kk;
size_t nbyte = elem_size * size;
int16x8_t xmm;
int32_t bt;
if (elem_size % 2) {
bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size);
} else {
for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
ii += 8 * elem_size) {
for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) {
xmm = vld1q_s16((int16_t *) &in_b[ii + jj]);
for (kk = 0; kk < 8; kk++) {
bt = move_byte_mask_neon((uint8x16_t) xmm);
xmm = vshlq_n_s16(xmm, 1);
size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
out_ui16[ind / 2] = bt;
}
}
}
}
return size * elem_size;
}
/* Untranspose bits within elements. */
int64_t bshuf_untrans_bit_elem_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
int64_t count;
CHECK_MULT_EIGHT(size);
void* tmp_buf = malloc(size * elem_size);
if (tmp_buf == NULL) return -1;
count = bshuf_trans_byte_bitrow_NEON(in, tmp_buf, size, elem_size);
CHECK_ERR_FREE(count, tmp_buf);
count = bshuf_shuffle_bit_eightelem_NEON(tmp_buf, out, size, elem_size);
free(tmp_buf);
return count;
}
#else // #ifdef USEARMNEON
int64_t bshuf_untrans_bit_elem_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -13;
}
int64_t bshuf_trans_bit_elem_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -13;
}
int64_t bshuf_trans_byte_bitrow_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -13;
}
int64_t bshuf_trans_bit_byte_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -13;
}
int64_t bshuf_trans_byte_elem_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -13;
}
int64_t bshuf_trans_byte_elem_NEON_64(const void* in, void* out, const size_t size) {
return -13;
}
int64_t bshuf_trans_byte_elem_NEON_32(const void* in, void* out, const size_t size) {
return -13;
}
int64_t bshuf_trans_byte_elem_NEON_16(const void* in, void* out, const size_t size) {
return -13;
}
int64_t bshuf_shuffle_bit_eightelem_NEON(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -13;
}
#endif
/* ---- Worker code that uses SSE2 ----
*
* The following code makes use of the SSE2 instruction set and specialized
* 16 byte registers. The SSE2 instructions are present on modern x86
* processors. The first Intel processor microarchitecture supporting SSE2 was
* Pentium 4 (2000).
*
*/
#ifdef USESSE2
/* Transpose bytes within elements for 16 bit elements. */
int64_t bshuf_trans_byte_elem_SSE_16(const void* in, void* out, const size_t size) {
size_t ii;
const char *in_b = (const char*) in;
char *out_b = (char*) out;
__m128i a0, b0, a1, b1;
for (ii=0; ii + 15 < size; ii += 16) {
a0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 0*16]);
b0 = _mm_loadu_si128((__m128i *) &in_b[2*ii + 1*16]);
a1 = _mm_unpacklo_epi8(a0, b0);
b1 = _mm_unpackhi_epi8(a0, b0);
a0 = _mm_unpacklo_epi8(a1, b1);
b0 = _mm_unpackhi_epi8(a1, b1);
a1 = _mm_unpacklo_epi8(a0, b0);
b1 = _mm_unpackhi_epi8(a0, b0);
a0 = _mm_unpacklo_epi8(a1, b1);
b0 = _mm_unpackhi_epi8(a1, b1);
_mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
_mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
}
return bshuf_trans_byte_elem_remainder(in, out, size, 2,
size - size % 16);
}
/* Transpose bytes within elements for 32 bit elements. */
int64_t bshuf_trans_byte_elem_SSE_32(const void* in, void* out, const size_t size) {
size_t ii;
const char *in_b;
char *out_b;
in_b = (const char*) in;
out_b = (char*) out;
__m128i a0, b0, c0, d0, a1, b1, c1, d1;
for (ii=0; ii + 15 < size; ii += 16) {
a0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 0*16]);
b0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 1*16]);
c0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 2*16]);
d0 = _mm_loadu_si128((__m128i *) &in_b[4*ii + 3*16]);
a1 = _mm_unpacklo_epi8(a0, b0);
b1 = _mm_unpackhi_epi8(a0, b0);
c1 = _mm_unpacklo_epi8(c0, d0);
d1 = _mm_unpackhi_epi8(c0, d0);
a0 = _mm_unpacklo_epi8(a1, b1);
b0 = _mm_unpackhi_epi8(a1, b1);
c0 = _mm_unpacklo_epi8(c1, d1);
d0 = _mm_unpackhi_epi8(c1, d1);
a1 = _mm_unpacklo_epi8(a0, b0);
b1 = _mm_unpackhi_epi8(a0, b0);
c1 = _mm_unpacklo_epi8(c0, d0);
d1 = _mm_unpackhi_epi8(c0, d0);
a0 = _mm_unpacklo_epi64(a1, c1);
b0 = _mm_unpackhi_epi64(a1, c1);
c0 = _mm_unpacklo_epi64(b1, d1);
d0 = _mm_unpackhi_epi64(b1, d1);
_mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
_mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
_mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0);
_mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0);
}
return bshuf_trans_byte_elem_remainder(in, out, size, 4,
size - size % 16);
}
/* Transpose bytes within elements for 64 bit elements. */
int64_t bshuf_trans_byte_elem_SSE_64(const void* in, void* out, const size_t size) {
size_t ii;
const char* in_b = (const char*) in;
char* out_b = (char*) out;
__m128i a0, b0, c0, d0, e0, f0, g0, h0;
__m128i a1, b1, c1, d1, e1, f1, g1, h1;
for (ii=0; ii + 15 < size; ii += 16) {
a0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 0*16]);
b0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 1*16]);
c0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 2*16]);
d0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 3*16]);
e0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 4*16]);
f0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 5*16]);
g0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 6*16]);
h0 = _mm_loadu_si128((__m128i *) &in_b[8*ii + 7*16]);
a1 = _mm_unpacklo_epi8(a0, b0);
b1 = _mm_unpackhi_epi8(a0, b0);
c1 = _mm_unpacklo_epi8(c0, d0);
d1 = _mm_unpackhi_epi8(c0, d0);
e1 = _mm_unpacklo_epi8(e0, f0);
f1 = _mm_unpackhi_epi8(e0, f0);
g1 = _mm_unpacklo_epi8(g0, h0);
h1 = _mm_unpackhi_epi8(g0, h0);
a0 = _mm_unpacklo_epi8(a1, b1);
b0 = _mm_unpackhi_epi8(a1, b1);
c0 = _mm_unpacklo_epi8(c1, d1);
d0 = _mm_unpackhi_epi8(c1, d1);
e0 = _mm_unpacklo_epi8(e1, f1);
f0 = _mm_unpackhi_epi8(e1, f1);
g0 = _mm_unpacklo_epi8(g1, h1);
h0 = _mm_unpackhi_epi8(g1, h1);
a1 = _mm_unpacklo_epi32(a0, c0);
b1 = _mm_unpackhi_epi32(a0, c0);
c1 = _mm_unpacklo_epi32(b0, d0);
d1 = _mm_unpackhi_epi32(b0, d0);
e1 = _mm_unpacklo_epi32(e0, g0);
f1 = _mm_unpackhi_epi32(e0, g0);
g1 = _mm_unpacklo_epi32(f0, h0);
h1 = _mm_unpackhi_epi32(f0, h0);
a0 = _mm_unpacklo_epi64(a1, e1);
b0 = _mm_unpackhi_epi64(a1, e1);
c0 = _mm_unpacklo_epi64(b1, f1);
d0 = _mm_unpackhi_epi64(b1, f1);
e0 = _mm_unpacklo_epi64(c1, g1);
f0 = _mm_unpackhi_epi64(c1, g1);
g0 = _mm_unpacklo_epi64(d1, h1);
h0 = _mm_unpackhi_epi64(d1, h1);
_mm_storeu_si128((__m128i *) &out_b[0*size + ii], a0);
_mm_storeu_si128((__m128i *) &out_b[1*size + ii], b0);
_mm_storeu_si128((__m128i *) &out_b[2*size + ii], c0);
_mm_storeu_si128((__m128i *) &out_b[3*size + ii], d0);
_mm_storeu_si128((__m128i *) &out_b[4*size + ii], e0);
_mm_storeu_si128((__m128i *) &out_b[5*size + ii], f0);
_mm_storeu_si128((__m128i *) &out_b[6*size + ii], g0);
_mm_storeu_si128((__m128i *) &out_b[7*size + ii], h0);
}
return bshuf_trans_byte_elem_remainder(in, out, size, 8,
size - size % 16);
}
/* Transpose bytes within elements using best SSE algorithm available. */
int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size,
const size_t elem_size) {
int64_t count;
// Trivial cases: power of 2 bytes.
switch (elem_size) {
case 1:
count = bshuf_copy(in, out, size, elem_size);
return count;
case 2:
count = bshuf_trans_byte_elem_SSE_16(in, out, size);
return count;
case 4:
count = bshuf_trans_byte_elem_SSE_32(in, out, size);
return count;
case 8:
count = bshuf_trans_byte_elem_SSE_64(in, out, size);
return count;
}
// Worst case: odd number of bytes. Turns out that this is faster for
// (odd * 2) byte elements as well (hence % 4).
if (elem_size % 4) {
count = bshuf_trans_byte_elem_scal(in, out, size, elem_size);
return count;
}
// Multiple of power of 2: transpose hierarchically.
{
size_t nchunk_elem;
void* tmp_buf = malloc(size * elem_size);
if (tmp_buf == NULL) return -1;
if ((elem_size % 8) == 0) {
nchunk_elem = elem_size / 8;
TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int64_t);
count = bshuf_trans_byte_elem_SSE_64(out, tmp_buf,
size * nchunk_elem);
bshuf_trans_elem(tmp_buf, out, 8, nchunk_elem, size);
} else if ((elem_size % 4) == 0) {
nchunk_elem = elem_size / 4;
TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int32_t);
count = bshuf_trans_byte_elem_SSE_32(out, tmp_buf,
size * nchunk_elem);
bshuf_trans_elem(tmp_buf, out, 4, nchunk_elem, size);
} else {
// Not used since scalar algorithm is faster.
nchunk_elem = elem_size / 2;
TRANS_ELEM_TYPE(in, out, size, nchunk_elem, int16_t);
count = bshuf_trans_byte_elem_SSE_16(out, tmp_buf,
size * nchunk_elem);
bshuf_trans_elem(tmp_buf, out, 2, nchunk_elem, size);
}
free(tmp_buf);
return count;
}
}
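/* Worked example of the hierarchical branch above (illustrative): for
 * elem_size == 12, (12 % 8) != 0 but (12 % 4) == 0, so nchunk_elem == 3.
 * Each 12 byte element is treated as three int32_t chunks: TRANS_ELEM_TYPE
 * groups corresponding chunks together, bshuf_trans_byte_elem_SSE_32
 * transposes the bytes within the 4 byte chunks, and bshuf_trans_elem
 * restores the element ordering. */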
/* Transpose bits within bytes. */
int64_t bshuf_trans_bit_byte_SSE(const void* in, void* out, const size_t size,
const size_t elem_size) {
size_t ii, kk;
const char* in_b = (const char*) in;
char* out_b = (char*) out;
uint16_t* out_ui16;
int64_t count;
size_t nbyte = elem_size * size;
CHECK_MULT_EIGHT(nbyte);
__m128i xmm;
int32_t bt;
for (ii = 0; ii + 15 < nbyte; ii += 16) {
xmm = _mm_loadu_si128((__m128i *) &in_b[ii]);
for (kk = 0; kk < 8; kk++) {
bt = _mm_movemask_epi8(xmm);
xmm = _mm_slli_epi16(xmm, 1);
out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
*out_ui16 = bt;
}
}
count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
nbyte - nbyte % 16);
return count;
}
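/* Note on the movemask trick used above (a sketch of the reasoning):
 * _mm_movemask_epi8 packs the most significant bit of each of the 16
 * bytes in xmm into one 16 bit integer, producing a whole row of the bit
 * transpose at once. _mm_slli_epi16 then shifts left by one, so on pass
 * kk the MSB of every byte is its original bit (7 - kk); bits leaking
 * across byte boundaries within the 16 bit lanes only ever occupy lower
 * positions and never reach the MSB within the 8 passes. */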
/* Transpose bits within elements. */
int64_t bshuf_trans_bit_elem_SSE(const void* in, void* out, const size_t size,
const size_t elem_size) {
int64_t count;
CHECK_MULT_EIGHT(size);
void* tmp_buf = malloc(size * elem_size);
if (tmp_buf == NULL) return -1;
count = bshuf_trans_byte_elem_SSE(in, out, size, elem_size);
CHECK_ERR_FREE(count, tmp_buf);
count = bshuf_trans_bit_byte_SSE(out, tmp_buf, size, elem_size);
CHECK_ERR_FREE(count, tmp_buf);
count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
free(tmp_buf);
return count;
}
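/* The bit transpose above is the composition of three stages (sketch):
 * bshuf_trans_byte_elem_SSE groups bytes by their position within the
 * element, bshuf_trans_bit_byte_SSE transposes the bits within those
 * byte rows, and bshuf_trans_bitrow_eight reorders the resulting bit
 * rows into the final bit-transposed layout. */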
/* For data organized into a row for each bit (8 * elem_size rows), transpose
* the bytes. */
int64_t bshuf_trans_byte_bitrow_SSE(const void* in, void* out, const size_t size,
const size_t elem_size) {
size_t ii, jj;
const char* in_b = (const char*) in;
char* out_b = (char*) out;
CHECK_MULT_EIGHT(size);
size_t nrows = 8 * elem_size;
size_t nbyte_row = size / 8;
__m128i a0, b0, c0, d0, e0, f0, g0, h0;
__m128i a1, b1, c1, d1, e1, f1, g1, h1;
__m128 *as, *bs, *cs, *ds, *es, *fs, *gs, *hs;
for (ii = 0; ii + 7 < nrows; ii += 8) {
for (jj = 0; jj + 15 < nbyte_row; jj += 16) {
a0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 0)*nbyte_row + jj]);
b0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 1)*nbyte_row + jj]);
c0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 2)*nbyte_row + jj]);
d0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 3)*nbyte_row + jj]);
e0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 4)*nbyte_row + jj]);
f0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 5)*nbyte_row + jj]);
g0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 6)*nbyte_row + jj]);
h0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 7)*nbyte_row + jj]);
a1 = _mm_unpacklo_epi8(a0, b0);
b1 = _mm_unpacklo_epi8(c0, d0);
c1 = _mm_unpacklo_epi8(e0, f0);
d1 = _mm_unpacklo_epi8(g0, h0);
e1 = _mm_unpackhi_epi8(a0, b0);
f1 = _mm_unpackhi_epi8(c0, d0);
g1 = _mm_unpackhi_epi8(e0, f0);
h1 = _mm_unpackhi_epi8(g0, h0);
a0 = _mm_unpacklo_epi16(a1, b1);
b0 = _mm_unpacklo_epi16(c1, d1);
c0 = _mm_unpackhi_epi16(a1, b1);
d0 = _mm_unpackhi_epi16(c1, d1);
e0 = _mm_unpacklo_epi16(e1, f1);
f0 = _mm_unpacklo_epi16(g1, h1);
g0 = _mm_unpackhi_epi16(e1, f1);
h0 = _mm_unpackhi_epi16(g1, h1);
a1 = _mm_unpacklo_epi32(a0, b0);
b1 = _mm_unpackhi_epi32(a0, b0);
c1 = _mm_unpacklo_epi32(c0, d0);
d1 = _mm_unpackhi_epi32(c0, d0);
e1 = _mm_unpacklo_epi32(e0, f0);
f1 = _mm_unpackhi_epi32(e0, f0);
g1 = _mm_unpacklo_epi32(g0, h0);
h1 = _mm_unpackhi_epi32(g0, h0);
// SSE2 has no integer storeh instruction, so reinterpret the registers
// as floats and use _mm_storel_pi/_mm_storeh_pi. (An integer storel,
// _mm_storel_epi64, does exist.)
as = (__m128 *) &a1;
bs = (__m128 *) &b1;
cs = (__m128 *) &c1;
ds = (__m128 *) &d1;
es = (__m128 *) &e1;
fs = (__m128 *) &f1;
gs = (__m128 *) &g1;
hs = (__m128 *) &h1;
_mm_storel_pi((__m64 *) &out_b[(jj + 0) * nrows + ii], *as);
_mm_storel_pi((__m64 *) &out_b[(jj + 2) * nrows + ii], *bs);
_mm_storel_pi((__m64 *) &out_b[(jj + 4) * nrows + ii], *cs);
_mm_storel_pi((__m64 *) &out_b[(jj + 6) * nrows + ii], *ds);
_mm_storel_pi((__m64 *) &out_b[(jj + 8) * nrows + ii], *es);
_mm_storel_pi((__m64 *) &out_b[(jj + 10) * nrows + ii], *fs);
_mm_storel_pi((__m64 *) &out_b[(jj + 12) * nrows + ii], *gs);
_mm_storel_pi((__m64 *) &out_b[(jj + 14) * nrows + ii], *hs);
_mm_storeh_pi((__m64 *) &out_b[(jj + 1) * nrows + ii], *as);
_mm_storeh_pi((__m64 *) &out_b[(jj + 3) * nrows + ii], *bs);
_mm_storeh_pi((__m64 *) &out_b[(jj + 5) * nrows + ii], *cs);
_mm_storeh_pi((__m64 *) &out_b[(jj + 7) * nrows + ii], *ds);
_mm_storeh_pi((__m64 *) &out_b[(jj + 9) * nrows + ii], *es);
_mm_storeh_pi((__m64 *) &out_b[(jj + 11) * nrows + ii], *fs);
_mm_storeh_pi((__m64 *) &out_b[(jj + 13) * nrows + ii], *gs);
_mm_storeh_pi((__m64 *) &out_b[(jj + 15) * nrows + ii], *hs);
}
for (jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj ++) {
out_b[jj * nrows + ii + 0] = in_b[(ii + 0)*nbyte_row + jj];
out_b[jj * nrows + ii + 1] = in_b[(ii + 1)*nbyte_row + jj];
out_b[jj * nrows + ii + 2] = in_b[(ii + 2)*nbyte_row + jj];
out_b[jj * nrows + ii + 3] = in_b[(ii + 3)*nbyte_row + jj];
out_b[jj * nrows + ii + 4] = in_b[(ii + 4)*nbyte_row + jj];
out_b[jj * nrows + ii + 5] = in_b[(ii + 5)*nbyte_row + jj];
out_b[jj * nrows + ii + 6] = in_b[(ii + 6)*nbyte_row + jj];
out_b[jj * nrows + ii + 7] = in_b[(ii + 7)*nbyte_row + jj];
}
}
return size * elem_size;
}
/* Shuffle bits within the bytes of eight element blocks. */
int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t size,
const size_t elem_size) {
CHECK_MULT_EIGHT(size);
// With a bit of care, this could be written such that it is safe for
// in_buf == out_buf.
const char* in_b = (const char*) in;
uint16_t* out_ui16 = (uint16_t*) out;
size_t ii, jj, kk;
size_t nbyte = elem_size * size;
__m128i xmm;
int32_t bt;
if (elem_size % 2) {
bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size);
} else {
for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
ii += 8 * elem_size) {
for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) {
xmm = _mm_loadu_si128((__m128i *) &in_b[ii + jj]);
for (kk = 0; kk < 8; kk++) {
bt = _mm_movemask_epi8(xmm);
xmm = _mm_slli_epi16(xmm, 1);
size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
out_ui16[ind / 2] = bt;
}
}
}
}
return size * elem_size;
}
/* Untranspose bits within elements. */
int64_t bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size,
const size_t elem_size) {
int64_t count;
CHECK_MULT_EIGHT(size);
void* tmp_buf = malloc(size * elem_size);
if (tmp_buf == NULL) return -1;
count = bshuf_trans_byte_bitrow_SSE(in, tmp_buf, size, elem_size);
CHECK_ERR_FREE(count, tmp_buf);
count = bshuf_shuffle_bit_eightelem_SSE(tmp_buf, out, size, elem_size);
free(tmp_buf);
return count;
}
#else // #ifdef USESSE2
int64_t bshuf_untrans_bit_elem_SSE(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -11;
}
int64_t bshuf_trans_bit_elem_SSE(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -11;
}
int64_t bshuf_trans_byte_bitrow_SSE(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -11;
}
int64_t bshuf_trans_bit_byte_SSE(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -11;
}
int64_t bshuf_trans_byte_elem_SSE(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -11;
}
int64_t bshuf_trans_byte_elem_SSE_64(const void* in, void* out, const size_t size) {
return -11;
}
int64_t bshuf_trans_byte_elem_SSE_32(const void* in, void* out, const size_t size) {
return -11;
}
int64_t bshuf_trans_byte_elem_SSE_16(const void* in, void* out, const size_t size) {
return -11;
}
int64_t bshuf_shuffle_bit_eightelem_SSE(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -11;
}
#endif // #ifdef USESSE2
/* ---- Code that requires AVX2. Intel Haswell (2013) and later. ---- */
/* ---- Worker code that uses AVX2 ----
*
* The following code makes use of the AVX2 instruction set and specialized
* 32 byte registers. The AVX2 instructions are present on newer x86
* processors. The first Intel processor microarchitecture supporting AVX2 was
* Haswell (2013).
*
*/
#ifdef USEAVX2
/* Transpose bits within bytes. */
int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size,
const size_t elem_size) {
size_t ii, kk;
const char* in_b = (const char*) in;
char* out_b = (char*) out;
int32_t* out_i32;
size_t nbyte = elem_size * size;
int64_t count;
__m256i ymm;
int32_t bt;
for (ii = 0; ii + 31 < nbyte; ii += 32) {
ymm = _mm256_loadu_si256((__m256i *) &in_b[ii]);
for (kk = 0; kk < 8; kk++) {
bt = _mm256_movemask_epi8(ymm);
ymm = _mm256_slli_epi16(ymm, 1);
out_i32 = (int32_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
*out_i32 = bt;
}
}
count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
nbyte - nbyte % 32);
return count;
}
/* Transpose bits within elements. */
int64_t bshuf_trans_bit_elem_AVX(const void* in, void* out, const size_t size,
const size_t elem_size) {
int64_t count;
CHECK_MULT_EIGHT(size);
void* tmp_buf = malloc(size * elem_size);
if (tmp_buf == NULL) return -1;
count = bshuf_trans_byte_elem_SSE(in, out, size, elem_size);
CHECK_ERR_FREE(count, tmp_buf);
count = bshuf_trans_bit_byte_AVX(out, tmp_buf, size, elem_size);
CHECK_ERR_FREE(count, tmp_buf);
count = bshuf_trans_bitrow_eight(tmp_buf, out, size, elem_size);
free(tmp_buf);
return count;
}
/* For data organized into a row for each bit (8 * elem_size rows), transpose
* the bytes. */
int64_t bshuf_trans_byte_bitrow_AVX(const void* in, void* out, const size_t size,
const size_t elem_size) {
size_t hh, ii, jj, kk, mm;
const char* in_b = (const char*) in;
char* out_b = (char*) out;
CHECK_MULT_EIGHT(size);
size_t nrows = 8 * elem_size;
size_t nbyte_row = size / 8;
if (elem_size % 4) return bshuf_trans_byte_bitrow_SSE(in, out, size,
elem_size);
__m256i ymm_0[8];
__m256i ymm_1[8];
__m256i ymm_storage[8][4];
for (jj = 0; jj + 31 < nbyte_row; jj += 32) {
for (ii = 0; ii + 3 < elem_size; ii += 4) {
for (hh = 0; hh < 4; hh ++) {
for (kk = 0; kk < 8; kk ++){
ymm_0[kk] = _mm256_loadu_si256((__m256i *) &in_b[
(ii * 8 + hh * 8 + kk) * nbyte_row + jj]);
}
for (kk = 0; kk < 4; kk ++){
ymm_1[kk] = _mm256_unpacklo_epi8(ymm_0[kk * 2],
ymm_0[kk * 2 + 1]);
ymm_1[kk + 4] = _mm256_unpackhi_epi8(ymm_0[kk * 2],
ymm_0[kk * 2 + 1]);
}
for (kk = 0; kk < 2; kk ++){
for (mm = 0; mm < 2; mm ++){
ymm_0[kk * 4 + mm] = _mm256_unpacklo_epi16(
ymm_1[kk * 4 + mm * 2],
ymm_1[kk * 4 + mm * 2 + 1]);
ymm_0[kk * 4 + mm + 2] = _mm256_unpackhi_epi16(
ymm_1[kk * 4 + mm * 2],
ymm_1[kk * 4 + mm * 2 + 1]);
}
}
for (kk = 0; kk < 4; kk ++){
ymm_1[kk * 2] = _mm256_unpacklo_epi32(ymm_0[kk * 2],
ymm_0[kk * 2 + 1]);
ymm_1[kk * 2 + 1] = _mm256_unpackhi_epi32(ymm_0[kk * 2],
ymm_0[kk * 2 + 1]);
}
for (kk = 0; kk < 8; kk ++){
ymm_storage[kk][hh] = ymm_1[kk];
}
}
for (mm = 0; mm < 8; mm ++) {
for (kk = 0; kk < 4; kk ++){
ymm_0[kk] = ymm_storage[mm][kk];
}
ymm_1[0] = _mm256_unpacklo_epi64(ymm_0[0], ymm_0[1]);
ymm_1[1] = _mm256_unpacklo_epi64(ymm_0[2], ymm_0[3]);
ymm_1[2] = _mm256_unpackhi_epi64(ymm_0[0], ymm_0[1]);
ymm_1[3] = _mm256_unpackhi_epi64(ymm_0[2], ymm_0[3]);
ymm_0[0] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 32);
ymm_0[1] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 32);
ymm_0[2] = _mm256_permute2x128_si256(ymm_1[0], ymm_1[1], 49);
ymm_0[3] = _mm256_permute2x128_si256(ymm_1[2], ymm_1[3], 49);
_mm256_storeu_si256((__m256i *) &out_b[
(jj + mm * 2 + 0 * 16) * nrows + ii * 8], ymm_0[0]);
_mm256_storeu_si256((__m256i *) &out_b[
(jj + mm * 2 + 0 * 16 + 1) * nrows + ii * 8], ymm_0[1]);
_mm256_storeu_si256((__m256i *) &out_b[
(jj + mm * 2 + 1 * 16) * nrows + ii * 8], ymm_0[2]);
_mm256_storeu_si256((__m256i *) &out_b[
(jj + mm * 2 + 1 * 16 + 1) * nrows + ii * 8], ymm_0[3]);
}
}
}
for (ii = 0; ii < nrows; ii ++ ) {
for (jj = nbyte_row - nbyte_row % 32; jj < nbyte_row; jj ++) {
out_b[jj * nrows + ii] = in_b[ii * nbyte_row + jj];
}
}
return size * elem_size;
}
/* Shuffle bits within the bytes of eight element blocks. */
int64_t bshuf_shuffle_bit_eightelem_AVX(const void* in, void* out, const size_t size,
const size_t elem_size) {
CHECK_MULT_EIGHT(size);
// With a bit of care, this could be written such that it is safe for
// in_buf == out_buf.
const char* in_b = (const char*) in;
char* out_b = (char*) out;
size_t ii, jj, kk;
size_t nbyte = elem_size * size;
__m256i ymm;
int32_t bt;
if (elem_size % 4) {
return bshuf_shuffle_bit_eightelem_SSE(in, out, size, elem_size);
} else {
for (jj = 0; jj + 31 < 8 * elem_size; jj += 32) {
for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
ii += 8 * elem_size) {
ymm = _mm256_loadu_si256((__m256i *) &in_b[ii + jj]);
for (kk = 0; kk < 8; kk++) {
bt = _mm256_movemask_epi8(ymm);
ymm = _mm256_slli_epi16(ymm, 1);
size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
* (int32_t *) &out_b[ind] = bt;
}
}
}
}
return size * elem_size;
}
/* Untranspose bits within elements. */
int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size,
const size_t elem_size) {
int64_t count;
CHECK_MULT_EIGHT(size);
void* tmp_buf = malloc(size * elem_size);
if (tmp_buf == NULL) return -1;
count = bshuf_trans_byte_bitrow_AVX(in, tmp_buf, size, elem_size);
CHECK_ERR_FREE(count, tmp_buf);
count = bshuf_shuffle_bit_eightelem_AVX(tmp_buf, out, size, elem_size);
free(tmp_buf);
return count;
}
#else // #ifdef USEAVX2
int64_t bshuf_trans_bit_byte_AVX(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -12;
}
int64_t bshuf_trans_bit_elem_AVX(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -12;
}
int64_t bshuf_trans_byte_bitrow_AVX(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -12;
}
int64_t bshuf_shuffle_bit_eightelem_AVX(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -12;
}
int64_t bshuf_untrans_bit_elem_AVX(const void* in, void* out, const size_t size,
const size_t elem_size) {
return -12;
}
#endif // #ifdef USEAVX2
/* ---- Drivers selecting best instruction set at compile time. ---- */
int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size,
const size_t elem_size) {
int64_t count;
#ifdef USEAVX2
count = bshuf_trans_bit_elem_AVX(in, out, size, elem_size);
#elif defined(USESSE2)
count = bshuf_trans_bit_elem_SSE(in, out, size, elem_size);
#elif defined(USEARMNEON)
count = bshuf_trans_bit_elem_NEON(in, out, size, elem_size);
#else
count = bshuf_trans_bit_elem_scal(in, out, size, elem_size);
#endif
return count;
}
int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size,
const size_t elem_size) {
int64_t count;
#ifdef USEAVX2
count = bshuf_untrans_bit_elem_AVX(in, out, size, elem_size);
#elif defined(USESSE2)
count = bshuf_untrans_bit_elem_SSE(in, out, size, elem_size);
#elif defined(USEARMNEON)
count = bshuf_untrans_bit_elem_NEON(in, out, size, elem_size);
#else
count = bshuf_untrans_bit_elem_scal(in, out, size, elem_size);
#endif
return count;
}
/* ---- Wrappers for implementing blocking ---- */
/* Wrap a function for processing a single block to process an entire buffer in
* parallel. */
int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out, \
const size_t size, const size_t elem_size, size_t block_size) {
omp_size_t ii = 0;
int64_t err = 0;
int64_t count, cum_count=0;
size_t last_block_size;
size_t leftover_bytes;
size_t this_iter;
char *last_in;
char *last_out;
ioc_chain C;
ioc_init(&C, in, out);
if (block_size == 0) {
block_size = bshuf_default_block_size(elem_size);
}
if (block_size % BSHUF_BLOCKED_MULT) return -81;
#if defined(_OPENMP)
#pragma omp parallel for schedule(dynamic, 1) \
private(count) reduction(+ : cum_count)
#endif
for (ii = 0; ii < (omp_size_t)( size / block_size ); ii ++) {
count = fun(&C, block_size, elem_size);
if (count < 0) err = count;
cum_count += count;
}
last_block_size = size % block_size;
last_block_size = last_block_size - last_block_size % BSHUF_BLOCKED_MULT;
if (last_block_size) {
count = fun(&C, last_block_size, elem_size);
if (count < 0) err = count;
cum_count += count;
}
if (err < 0) return err;
leftover_bytes = size % BSHUF_BLOCKED_MULT * elem_size;
last_in = (char *) ioc_get_in(&C, &this_iter);
ioc_set_next_in(&C, &this_iter, (void *) (last_in + leftover_bytes));
last_out = (char *) ioc_get_out(&C, &this_iter);
ioc_set_next_out(&C, &this_iter, (void *) (last_out + leftover_bytes));
memcpy(last_out, last_in, leftover_bytes);
ioc_destroy(&C);
return cum_count + leftover_bytes;
}
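/* Worked example of the blocking above (illustrative numbers): for
 * size == 1000 elements and block_size == 256, the parallel loop runs
 * 1000 / 256 == 3 full blocks (768 elements); size % block_size == 232,
 * already a multiple of BSHUF_BLOCKED_MULT, so one partial block of 232
 * elements follows; 1000 % 8 == 0, so no leftover bytes need the final
 * memcpy. */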
/* Bitshuffle a single block. */
int64_t bshuf_bitshuffle_block(ioc_chain *C_ptr, \
const size_t size, const size_t elem_size) {
size_t this_iter;
const void *in;
void *out;
int64_t count;
in = ioc_get_in(C_ptr, &this_iter);
ioc_set_next_in(C_ptr, &this_iter,
(void*) ((char*) in + size * elem_size));
out = ioc_get_out(C_ptr, &this_iter);
ioc_set_next_out(C_ptr, &this_iter,
(void *) ((char *) out + size * elem_size));
count = bshuf_trans_bit_elem(in, out, size, elem_size);
return count;
}
/* Bitunshuffle a single block. */
int64_t bshuf_bitunshuffle_block(ioc_chain* C_ptr, \
const size_t size, const size_t elem_size) {
size_t this_iter;
const void *in;
void *out;
int64_t count;
in = ioc_get_in(C_ptr, &this_iter);
ioc_set_next_in(C_ptr, &this_iter,
(void*) ((char*) in + size * elem_size));
out = ioc_get_out(C_ptr, &this_iter);
ioc_set_next_out(C_ptr, &this_iter,
(void *) ((char *) out + size * elem_size));
count = bshuf_untrans_bit_elem(in, out, size, elem_size);
return count;
}
/* Write a 64 bit unsigned integer to a buffer in big endian order. */
void bshuf_write_uint64_BE(void* buf, uint64_t num) {
int ii;
uint8_t* b = (uint8_t*) buf;
uint64_t pow28 = 1 << 8;
for (ii = 7; ii >= 0; ii--) {
b[ii] = num % pow28;
num = num / pow28;
}
}
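/* Example: bshuf_write_uint64_BE(buf, 0x0102030405060708ULL) leaves
 * buf == {0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08}; the most
 * significant byte lands first regardless of host endianness. */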
/* Read a 64 bit unsigned integer from a buffer in big endian order. */
uint64_t bshuf_read_uint64_BE(void* buf) {
int ii;
uint8_t* b = (uint8_t*) buf;
uint64_t num = 0, pow28 = 1 << 8, cp = 1;
for (ii = 7; ii >= 0; ii--) {
num += b[ii] * cp;
cp *= pow28;
}
return num;
}
/* Write a 32 bit unsigned integer to a buffer in big endian order. */
void bshuf_write_uint32_BE(void* buf, uint32_t num) {
int ii;
uint8_t* b = (uint8_t*) buf;
uint32_t pow28 = 1 << 8;
for (ii = 3; ii >= 0; ii--) {
b[ii] = num % pow28;
num = num / pow28;
}
}
/* Read a 32 bit unsigned integer from a buffer in big endian order. */
uint32_t bshuf_read_uint32_BE(const void* buf) {
int ii;
uint8_t* b = (uint8_t*) buf;
uint32_t num = 0, pow28 = 1 << 8, cp = 1;
for (ii = 3; ii >= 0; ii--) {
num += b[ii] * cp;
cp *= pow28;
}
return num;
}
/* ---- Public functions ----
*
* See header file for description and usage.
*
*/
size_t bshuf_default_block_size(const size_t elem_size) {
// This function needs to be absolutely stable between versions.
// Otherwise encoded data will not be decodable.
size_t block_size = BSHUF_TARGET_BLOCK_SIZE_B / elem_size;
// Ensure it is a required multiple.
block_size = (block_size / BSHUF_BLOCKED_MULT) * BSHUF_BLOCKED_MULT;
return MAX(block_size, BSHUF_MIN_RECOMMEND_BLOCK);
}
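/* Worked example (illustrative): for elem_size == 4,
 * BSHUF_TARGET_BLOCK_SIZE_B / 4 == 2048, already a multiple of
 * BSHUF_BLOCKED_MULT and above BSHUF_MIN_RECOMMEND_BLOCK, so 2048
 * elements (8 KiB of data) is returned. For elem_size == 512 the
 * quotient is 16, which falls below the minimum, so
 * BSHUF_MIN_RECOMMEND_BLOCK (128) is returned instead. */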
int64_t bshuf_bitshuffle(const void* in, void* out, const size_t size,
const size_t elem_size, size_t block_size) {
return bshuf_blocked_wrap_fun(&bshuf_bitshuffle_block, in, out, size,
elem_size, block_size);
}
int64_t bshuf_bitunshuffle(const void* in, void* out, const size_t size,
const size_t elem_size, size_t block_size) {
return bshuf_blocked_wrap_fun(&bshuf_bitunshuffle_block, in, out, size,
elem_size, block_size);
}
#undef TRANS_BIT_8X8
#undef TRANS_ELEM_TYPE
#undef MAX
#undef CHECK_MULT_EIGHT
#undef CHECK_ERR_FREE
#undef USESSE2
#undef USEAVX2
==== bitshuffle-0.3.5/src/bitshuffle_core.h ====
/*
* Bitshuffle - Filter for improving compression of typed binary data.
*
* This file is part of Bitshuffle
* Author: Kiyoshi Masui
* Website: http://www.github.com/kiyo-masui/bitshuffle
* Created: 2014
*
* See LICENSE file for details about copyright and rights to use.
*
*
* Header File
*
* Worker routines return an int64_t which is the number of bytes processed
* if positive or an error code if negative.
*
* Error codes:
* -1 : Failed to allocate memory.
* -11 : Missing SSE.
* -12 : Missing AVX.
* -13 : Missing Arm Neon.
* -80 : Input size not a multiple of 8.
* -81 : block_size not multiple of 8.
* -91 : Decompression error, wrong number of bytes processed.
* -1YYY : Error internal to compression routine with error code -YYY.
*/
#ifndef BITSHUFFLE_CORE_H
#define BITSHUFFLE_CORE_H
// We assume GNU g++ defining `__cplusplus` has stdint.h
#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199900L) || defined(__cplusplus)
#include <stdint.h>
#else
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef signed int int32_t;
typedef unsigned long long uint64_t;
typedef long long int64_t;
#endif
#include <stdlib.h>
// These are usually set in the setup.py.
#ifndef BSHUF_VERSION_MAJOR
#define BSHUF_VERSION_MAJOR 0
#define BSHUF_VERSION_MINOR 3
#define BSHUF_VERSION_POINT 5
#endif
#ifdef __cplusplus
extern "C" {
#endif
/* --- bshuf_using_SSE2 ----
*
* Whether routines were compiled with the SSE2 instruction set.
*
* Returns
* -------
* 1 if using SSE2, 0 otherwise.
*
*/
int bshuf_using_SSE2(void);
/* ---- bshuf_using_AVX2 ----
*
* Whether routines were compiled with the AVX2 instruction set.
*
* Returns
* -------
* 1 if using AVX2, 0 otherwise.
*
*/
int bshuf_using_AVX2(void);
/* ---- bshuf_default_block_size ----
*
* The default block size as function of element size.
*
* This is the block size used by the blocked routines (any routine
* taking a *block_size* argument) when the block_size is not provided
* (zero is passed).
*
* The results of this routine are guaranteed to be stable such that
* shuffled/compressed data can always be decompressed.
*
* Parameters
* ----------
* elem_size : element size of data to be shuffled/compressed.
*
*/
size_t bshuf_default_block_size(const size_t elem_size);
/* ---- bshuf_bitshuffle ----
*
* Bitshuffle the data.
*
* Transpose the bits within elements, in blocks of *block_size*
* elements.
*
* Parameters
* ----------
* in : input buffer, must be of size * elem_size bytes
* out : output buffer, must be of size * elem_size bytes
* size : number of elements in input
* elem_size : element size of typed data
* block_size : Do transpose in blocks of this many elements. Pass 0 to
* select automatically (recommended).
*
* Returns
* -------
* number of bytes processed, negative error-code if failed.
*
*/
int64_t bshuf_bitshuffle(const void* in, void* out, const size_t size,
const size_t elem_size, size_t block_size);
/* ---- bshuf_bitunshuffle ----
*
* Unshuffle bitshuffled data.
*
* Untranspose the bits within elements, in blocks of *block_size*
* elements.
*
* To properly unshuffle bitshuffled data, *size*, *elem_size* and *block_size*
* must match the parameters used to shuffle the data.
*
* Parameters
* ----------
* in : input buffer, must be of size * elem_size bytes
* out : output buffer, must be of size * elem_size bytes
* size : number of elements in input
* elem_size : element size of typed data
* block_size : Do transpose in blocks of this many elements. Pass 0 to
* select automatically (recommended).
*
* Returns
* -------
* number of bytes processed, negative error-code if failed.
*
*/
int64_t bshuf_bitunshuffle(const void* in, void* out, const size_t size,
const size_t elem_size, size_t block_size);
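/* Usage sketch (illustrative, not part of the library): round trip a
 * small array using the automatically selected block size. Error
 * handling is abbreviated.
 *
 *     float in[1024], shuf[1024], out[1024];
 *     int64_t count;
 *     count = bshuf_bitshuffle(in, shuf, 1024, sizeof(float), 0);
 *     if (count < 0) abort();  // negative values are error codes
 *     count = bshuf_bitunshuffle(shuf, out, 1024, sizeof(float), 0);
 *     if (count < 0) abort();
 */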
#ifdef __cplusplus
} // extern "C"
#endif
#endif // BITSHUFFLE_CORE_H
==== bitshuffle-0.3.5/src/bitshuffle_internals.h ====
/*
* Bitshuffle - Filter for improving compression of typed binary data.
*
* This file is part of Bitshuffle
* Author: Kiyoshi Masui
* Website: http://www.github.com/kiyo-masui/bitshuffle
* Created: 2014
*
* See LICENSE file for details about copyright and rights to use.
*/
#ifndef BITSHUFFLE_INTERNALS_H
#define BITSHUFFLE_INTERNALS_H
// We assume GNU g++ defining `__cplusplus` has stdint.h
#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199900L) || defined(__cplusplus)
#include <stdint.h>
#else
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef signed int int32_t;
typedef unsigned long long uint64_t;
typedef long long int64_t;
#endif
#include <stdlib.h>
#include "iochain.h"
// Constants.
#ifndef BSHUF_MIN_RECOMMEND_BLOCK
#define BSHUF_MIN_RECOMMEND_BLOCK 128
#define BSHUF_BLOCKED_MULT 8 // Block sizes must be multiple of this.
#define BSHUF_TARGET_BLOCK_SIZE_B 8192
#endif
// Macros.
#define CHECK_ERR_FREE(count, buf) if (count < 0) { free(buf); return count; }
#ifdef __cplusplus
extern "C" {
#endif
/* ---- Utility functions for internal use only ---- */
int64_t bshuf_trans_bit_elem(const void* in, void* out, const size_t size,
const size_t elem_size);
/* Read a 32 bit unsigned integer from a buffer in big endian order. */
uint32_t bshuf_read_uint32_BE(const void* buf);
/* Write a 32 bit unsigned integer to a buffer in big endian order. */
void bshuf_write_uint32_BE(void* buf, uint32_t num);
int64_t bshuf_untrans_bit_elem(const void* in, void* out, const size_t size,
const size_t elem_size);
/* Function definition for worker functions that process a single block. */
typedef int64_t (*bshufBlockFunDef)(ioc_chain* C_ptr,
const size_t size, const size_t elem_size);
/* Wrap a function for processing a single block to process an entire buffer in
* parallel. */
int64_t bshuf_blocked_wrap_fun(bshufBlockFunDef fun, const void* in, void* out,
const size_t size, const size_t elem_size, size_t block_size);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // BITSHUFFLE_INTERNALS_H
==== bitshuffle-0.3.5/src/bshuf_h5filter.c ====
/*
* Bitshuffle HDF5 filter
*
* This file is part of Bitshuffle
* Author: Kiyoshi Masui
* Website: http://www.github.com/kiyo-masui/bitshuffle
* Created: 2014
*
* See LICENSE file for details about copyright and rights to use.
*
*/
#include "bitshuffle.h"
#include "bshuf_h5filter.h"
#define PUSH_ERR(func, minor, str) \
H5Epush1(__FILE__, func, __LINE__, H5E_PLINE, minor, str)
// Prototypes from bitshuffle.c
void bshuf_write_uint64_BE(void* buf, uint64_t num);
uint64_t bshuf_read_uint64_BE(void* buf);
void bshuf_write_uint32_BE(void* buf, uint32_t num);
uint32_t bshuf_read_uint32_BE(const void* buf);
// Only called on compression, not on the reverse (decompression) path.
herr_t bshuf_h5_set_local(hid_t dcpl, hid_t type, hid_t space){
herr_t r;
size_t ii;
unsigned int elem_size;
unsigned int flags;
size_t nelements = 8;
size_t nelem_max = 11;
unsigned values[] = {0,0,0,0,0,0,0,0,0,0,0};
unsigned tmp_values[] = {0,0,0,0,0,0,0,0};
char msg[80];
r = H5Pget_filter_by_id2(dcpl, BSHUF_H5FILTER, &flags, &nelements,
tmp_values, 0, NULL, NULL);
if(r<0) return -1;
// First 3 slots reserved. Move any passed options to higher addresses.
for (ii=0; ii < nelements && ii + 3 < nelem_max; ii++) {
values[ii + 3] = tmp_values[ii];
}
nelements = 3 + nelements;
values[0] = BSHUF_VERSION_MAJOR;
values[1] = BSHUF_VERSION_MINOR;
elem_size = H5Tget_size(type);
if(elem_size <= 0) {
PUSH_ERR("bshuf_h5_set_local", H5E_CALLBACK,
"Invalid element size.");
return -1;
}
values[2] = elem_size;
// Validate user supplied arguments.
if (nelements > 3) {
// values[3] is unsigned, so only the multiple-of-8 check is needed.
if (values[3] % 8) {
sprintf(msg, "Error in bitshuffle. Invalid block size: %u.",
values[3]);
PUSH_ERR("bshuf_h5_set_local", H5E_CALLBACK, msg);
return -1;
}
}
if (nelements > 4) {
switch (values[4]) {
case 0:
break;
case BSHUF_H5_COMPRESS_LZ4:
break;
default:
PUSH_ERR("bshuf_h5_set_local", H5E_CALLBACK,
"Invalid bitshuffle compression.");
}
}
r = H5Pmodify_filter(dcpl, BSHUF_H5FILTER, flags, nelements, values);
if(r<0) return -1;
return 1;
}
size_t bshuf_h5_filter(unsigned int flags, size_t cd_nelmts,
const unsigned int cd_values[], size_t nbytes,
size_t *buf_size, void **buf) {
size_t size, elem_size;
int err;
char msg[80];
size_t block_size = 0;
size_t buf_size_out, nbytes_uncomp, nbytes_out;
char* in_buf = *buf;
void *out_buf;
if (cd_nelmts < 3) {
PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK,
"Not enough parameters.");
return 0;
}
elem_size = cd_values[2];
// User specified block size.
if (cd_nelmts > 3) block_size = cd_values[3];
if (block_size == 0) block_size = bshuf_default_block_size(elem_size);
// Compression in addition to bitshuffle.
if (cd_nelmts > 4 && cd_values[4] == BSHUF_H5_COMPRESS_LZ4) {
if (flags & H5Z_FLAG_REVERSE) {
// The first eight bytes are the number of bytes in the output buffer,
// big endian.
nbytes_uncomp = bshuf_read_uint64_BE(in_buf);
// Override the block size with the one read from the header.
block_size = bshuf_read_uint32_BE((const char*) in_buf + 8) / elem_size;
// Skip over the header.
in_buf += 12;
buf_size_out = nbytes_uncomp;
} else {
nbytes_uncomp = nbytes;
buf_size_out = bshuf_compress_lz4_bound(nbytes_uncomp / elem_size,
elem_size, block_size) + 12;
}
} else {
nbytes_uncomp = nbytes;
buf_size_out = nbytes;
}
// TODO: remove this restriction by memcpying the extra bytes.
if (nbytes_uncomp % elem_size) {
PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK,
"Non integer number of elements.");
return 0;
}
size = nbytes_uncomp / elem_size;
out_buf = malloc(buf_size_out);
if (out_buf == NULL) {
PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK,
"Could not allocate output buffer.");
return 0;
}
if (cd_nelmts > 4 && cd_values[4] == BSHUF_H5_COMPRESS_LZ4) {
if (flags & H5Z_FLAG_REVERSE) {
// Bit unshuffle/decompress.
err = bshuf_decompress_lz4(in_buf, out_buf, size, elem_size, block_size);
nbytes_out = nbytes_uncomp;
} else {
// Bit shuffle/compress.
// Write the header, described in
// http://www.hdfgroup.org/services/filters/HDF5_LZ4.pdf.
// Technically we should be using signed integers instead of
// unsigned ones, however for valid inputs (positive numbers) these
// have the same representation.
bshuf_write_uint64_BE(out_buf, nbytes_uncomp);
bshuf_write_uint32_BE((char*) out_buf + 8, block_size * elem_size);
err = bshuf_compress_lz4(in_buf, (char*) out_buf + 12, size,
elem_size, block_size);
nbytes_out = err + 12;
}
} else {
if (flags & H5Z_FLAG_REVERSE) {
// Bit unshuffle.
err = bshuf_bitunshuffle(in_buf, out_buf, size, elem_size,
block_size);
} else {
// Bit shuffle.
err = bshuf_bitshuffle(in_buf, out_buf, size, elem_size,
block_size);
}
nbytes_out = nbytes;
}
if (err < 0) {
sprintf(msg, "Error in bitshuffle with error code %d.", err);
PUSH_ERR("bshuf_h5_filter", H5E_CALLBACK, msg);
free(out_buf);
return 0;
} else {
free(*buf);
*buf = out_buf;
*buf_size = buf_size_out;
return nbytes_out;
}
}
H5Z_class_t bshuf_H5Filter[1] = {{
H5Z_CLASS_T_VERS,
(H5Z_filter_t)(BSHUF_H5FILTER),
1, 1,
"bitshuffle; see https://github.com/kiyo-masui/bitshuffle",
NULL,
(H5Z_set_local_func_t)(bshuf_h5_set_local),
(H5Z_func_t)(bshuf_h5_filter)
}};
int bshuf_register_h5filter(void){
int retval;
retval = H5Zregister(bshuf_H5Filter);
if(retval<0){
PUSH_ERR("bshuf_register_h5filter",
H5E_CANTREGISTER, "Can't register bitshuffle filter");
}
return retval;
}
==== bitshuffle-0.3.5/src/bshuf_h5filter.h ====
/*
* Bitshuffle HDF5 filter
*
* This file is part of Bitshuffle
* Author: Kiyoshi Masui
* Website: http://www.github.com/kiyo-masui/bitshuffle
* Created: 2014
*
* See LICENSE file for details about copyright and rights to use.
*
*
* Header File
*
* Filter Options
* --------------
* block_size (option slot 0) : integer (optional)
* What block size to use (in elements not bytes). Default is 0,
* for which bitshuffle will pick a block size with a target of 8kb.
* Compression (option slot 1) : 0 or BSHUF_H5_COMPRESS_LZ4
* Whether to apply LZ4 compression to the data after bitshuffling.
* This is much faster than applying compression as a second filter
* because it is done when the small block of data is already in the
* L1 cache.
*
* For LZ4 compression, the compressed format of the data is the same as
* for the normal LZ4 filter described in
* http://www.hdfgroup.org/services/filters/HDF5_LZ4.pdf.
*
*/
#ifndef BSHUF_H5FILTER_H
#define BSHUF_H5FILTER_H
#define H5Z_class_t_vers 2
#include "hdf5.h"
#define BSHUF_H5FILTER 32008
#define BSHUF_H5_COMPRESS_LZ4 2
extern H5Z_class_t bshuf_H5Filter[1];
/* ---- bshuf_register_h5filter ----
*
* Register the bitshuffle HDF5 filter within the HDF5 library.
*
* Call this before using the bitshuffle HDF5 filter from C unless
* using dynamically loaded filters.
*
*/
int bshuf_register_h5filter(void);
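/* Usage sketch (illustrative; `dcpl` is a hypothetical chunked dataset
 * creation property list). Register the filter, then request bitshuffle
 * with an automatic block size plus LZ4 compression:
 *
 *     unsigned cd_values[2] = {0, BSHUF_H5_COMPRESS_LZ4};
 *     bshuf_register_h5filter();
 *     H5Pset_filter(dcpl, BSHUF_H5FILTER, H5Z_FLAG_MANDATORY,
 *                   2, cd_values);
 */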
#endif // BSHUF_H5FILTER_H
==== bitshuffle-0.3.5/src/bshuf_h5plugin.c ====
/*
* Dynamically loaded filter plugin for HDF5 Bitshuffle filter.
*
* This file is part of Bitshuffle
* Author: Kiyoshi Masui
* Website: http://www.github.com/kiyo-masui/bitshuffle
* Created: 2014
*
* See LICENSE file for details about copyright and rights to use.
*
*/
#include "bshuf_h5filter.h"
#include "H5PLextern.h"
H5PL_type_t H5PLget_plugin_type(void) {return H5PL_TYPE_FILTER;}
const void* H5PLget_plugin_info(void) {return bshuf_H5Filter;}
==== bitshuffle-0.3.5/src/iochain.c ====
/*
* IOchain - Distribute a chain of dependent IO events among threads.
*
* This file is part of Bitshuffle
* Author: Kiyoshi Masui
* Website: http://www.github.com/kiyo-masui/bitshuffle
* Created: 2014
*
* See LICENSE file for details about copyright and rights to use.
*
*/
#include <stdlib.h>
#include "iochain.h"
void ioc_init(ioc_chain *C, const void *in_ptr_0, void *out_ptr_0) {
#ifdef _OPENMP
omp_init_lock(&C->next_lock);
for (size_t ii = 0; ii < IOC_SIZE; ii ++) {
omp_init_lock(&(C->in_pl[ii].lock));
omp_init_lock(&(C->out_pl[ii].lock));
}
#endif
C->next = 0;
C->in_pl[0].ptr = in_ptr_0;
C->out_pl[0].ptr = out_ptr_0;
}
void ioc_destroy(ioc_chain *C) {
#ifdef _OPENMP
omp_destroy_lock(&C->next_lock);
for (size_t ii = 0; ii < IOC_SIZE; ii ++) {
omp_destroy_lock(&(C->in_pl[ii].lock));
omp_destroy_lock(&(C->out_pl[ii].lock));
}
#endif
}
const void * ioc_get_in(ioc_chain *C, size_t *this_iter) {
#ifdef _OPENMP
omp_set_lock(&C->next_lock);
#pragma omp flush
#endif
*this_iter = C->next;
C->next ++;
#ifdef _OPENMP
omp_set_lock(&(C->in_pl[*this_iter % IOC_SIZE].lock));
omp_set_lock(&(C->in_pl[(*this_iter + 1) % IOC_SIZE].lock));
omp_set_lock(&(C->out_pl[(*this_iter + 1) % IOC_SIZE].lock));
omp_unset_lock(&C->next_lock);
#endif
return C->in_pl[*this_iter % IOC_SIZE].ptr;
}
void ioc_set_next_in(ioc_chain *C, size_t* this_iter, void* in_ptr) {
C->in_pl[(*this_iter + 1) % IOC_SIZE].ptr = in_ptr;
#ifdef _OPENMP
omp_unset_lock(&(C->in_pl[(*this_iter + 1) % IOC_SIZE].lock));
#endif
}
void * ioc_get_out(ioc_chain *C, size_t *this_iter) {
#ifdef _OPENMP
omp_set_lock(&(C->out_pl[(*this_iter) % IOC_SIZE].lock));
#pragma omp flush
#endif
void *out_ptr = C->out_pl[*this_iter % IOC_SIZE].ptr;
#ifdef _OPENMP
omp_unset_lock(&(C->out_pl[(*this_iter) % IOC_SIZE].lock));
#endif
return out_ptr;
}
void ioc_set_next_out(ioc_chain *C, size_t *this_iter, void* out_ptr) {
C->out_pl[(*this_iter + 1) % IOC_SIZE].ptr = out_ptr;
#ifdef _OPENMP
omp_unset_lock(&(C->out_pl[(*this_iter + 1) % IOC_SIZE].lock));
// *in_pl[this_iter]* lock released at the end of the iteration to avoid being
// overtaken by previous threads and having *out_pl[this_iter]* corrupted.
// Especially worried about thread 0, iteration 0.
omp_unset_lock(&(C->in_pl[(*this_iter) % IOC_SIZE].lock));
#endif
}
==== bitshuffle-0.3.5/src/iochain.h ====
/*
* IOchain - Distribute a chain of dependent IO events among threads.
*
* This file is part of Bitshuffle
* Author: Kiyoshi Masui
* Website: http://www.github.com/kiyo-masui/bitshuffle
* Created: 2014
*
* See LICENSE file for details about copyright and rights to use.
*
*
* Header File
*
* Similar in concept to a queue. Each task includes reading an input
* and writing output, but the location of the input/output (the pointers)
* depends on the previous item in the chain.
*
* This is designed for parallelizing blocked compression/decompression IO,
* where the destination of a compressed block depends on the compressed size
* of all previous blocks.
*
* Implemented with OpenMP locks.
*
*
* Usage
* -----
* - Call `ioc_init` in serial block.
* - Each thread should create a local variable *size_t this_iter* and
* pass its address to all function calls. Its value will be set
* inside the functions and is used to identify the thread.
* - Each thread must call each of the `ioc_get*` and `ioc_set*` methods
* exactly once per iteration, starting with `ioc_get_in` and ending
* with `ioc_set_next_out`.
* - The order (`ioc_get_in`, `ioc_set_next_in`, *work*, `ioc_get_out`,
* `ioc_set_next_out`, *work*) is most efficient.
* - `ioc_get_in` is blocked until the previous entry's
* `ioc_set_next_in` is called.
* - `ioc_get_out` is blocked until the previous entry's
* `ioc_set_next_out` is called.
* - There are no blocks on the very first iteration.
* - Call `ioc_destroy` in serial block.
* - Safe for num_threads >= IOC_SIZE (but less efficient).
*
*/
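/* Sketch of the per-iteration pattern described above (illustrative;
 * `C`, `block_bytes` and `out_bytes` stand in for values known to the
 * caller):
 *
 *     size_t this_iter;
 *     const void *in = ioc_get_in(&C, &this_iter);
 *     ioc_set_next_in(&C, &this_iter,
 *                     (void *) ((char *) in + block_bytes));
 *     // ... process the input block ...
 *     void *out = ioc_get_out(&C, &this_iter);
 *     ioc_set_next_out(&C, &this_iter,
 *                      (void *) ((char *) out + out_bytes));
 *     // ... write the output block ...
 */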
#ifndef IOCHAIN_H
#define IOCHAIN_H
#include <stdlib.h>
#ifdef _OPENMP
#include
#endif
#define IOC_SIZE 33
typedef struct ioc_ptr_and_lock {
#ifdef _OPENMP
omp_lock_t lock;
#endif
void *ptr;
} ptr_and_lock;
typedef struct ioc_const_ptr_and_lock {
#ifdef _OPENMP
omp_lock_t lock;
#endif
const void *ptr;
} const_ptr_and_lock;
typedef struct ioc_chain {
#ifdef _OPENMP
omp_lock_t next_lock;
#endif
size_t next;
const_ptr_and_lock in_pl[IOC_SIZE];
ptr_and_lock out_pl[IOC_SIZE];
} ioc_chain;
void ioc_init(ioc_chain *C, const void *in_ptr_0, void *out_ptr_0);
void ioc_destroy(ioc_chain *C);
const void * ioc_get_in(ioc_chain *C, size_t *this_iter);
void ioc_set_next_in(ioc_chain *C, size_t* this_iter, void* in_ptr);
void * ioc_get_out(ioc_chain *C, size_t *this_iter);
void ioc_set_next_out(ioc_chain *C, size_t *this_iter, void* out_ptr);
#endif // IOCHAIN_H
==== bitshuffle-0.3.5/src/lzf_h5plugin.c ====
/*
* Dynamically loaded filter plugin for HDF5 LZF filter.
*
* This file is part of Bitshuffle
* Author: Kiyoshi Masui
* Website: http://www.github.com/kiyo-masui/bitshuffle
* Created: 2014
*
* See LICENSE file for details about copyright and rights to use.
*
*/
#define H5Z_class_t_vers 2
#include "lzf_filter.h"
#include "H5PLextern.h"
#include <stdlib.h>
size_t lzf_filter(unsigned flags, size_t cd_nelmts,
const unsigned cd_values[], size_t nbytes,
size_t *buf_size, void **buf);
herr_t lzf_set_local(hid_t dcpl, hid_t type, hid_t space);
H5Z_class_t lzf_H5Filter[1] = {{
H5Z_CLASS_T_VERS,
(H5Z_filter_t)(H5PY_FILTER_LZF),
1, 1,
"lzf",
NULL,
(H5Z_set_local_func_t)(lzf_set_local),
(H5Z_func_t)(lzf_filter)
}};
H5PL_type_t H5PLget_plugin_type(void) {return H5PL_TYPE_FILTER;}
const void* H5PLget_plugin_info(void) {return lzf_H5Filter;}